Commit 0a21fff9 in OpenDAS / Lmdeploy
Authored Dec 20, 2023 by xiabo

    Adapt to 0.1.0

Parent: 9484fd1c
Changes: 158
Showing 20 changed files with 8704 additions and 0 deletions (+8704, -0)
3rdparty/core-r22.12/cmake/TritonCoreConfig.cmake.in  +37  -0
3rdparty/core-r22.12/include/triton/core/tritonbackend.h  +1410  -0
3rdparty/core-r22.12/include/triton/core/tritonrepoagent.h  +417  -0
3rdparty/core-r22.12/include/triton/core/tritonserver.h  +2360  -0
3rdparty/core-r22.12/src/backend_config.cc  +225  -0
3rdparty/core-r22.12/src/backend_config.h  +77  -0
3rdparty/core-r22.12/src/backend_manager.cc  +383  -0
3rdparty/core-r22.12/src/backend_manager.h  +174  -0
3rdparty/core-r22.12/src/backend_memory_manager.cc  +149  -0
3rdparty/core-r22.12/src/backend_memory_manager.h  +36  -0
3rdparty/core-r22.12/src/backend_model.cc  +1301  -0
3rdparty/core-r22.12/src/backend_model.h  +133  -0
3rdparty/core-r22.12/src/backend_model_instance.cc  +966  -0
3rdparty/core-r22.12/src/backend_model_instance.h  +200  -0
3rdparty/core-r22.12/src/buffer_attributes.cc  +104  -0
3rdparty/core-r22.12/src/buffer_attributes.h  +79  -0
3rdparty/core-r22.12/src/constants.h  +108  -0
3rdparty/core-r22.12/src/cuda_memory_manager.cc  +197  -0
3rdparty/core-r22.12/src/cuda_memory_manager.h  +85  -0
3rdparty/core-r22.12/src/cuda_utils.cc  +263  -0
Too many changes to show. To preserve performance only 158 of 158+ files are displayed.
3rdparty/core-r22.12/cmake/TritonCoreConfig.cmake.in  (new file, 0 → 100644)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)

get_filename_component(
  TRITONCORE_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)

list(APPEND CMAKE_MODULE_PATH ${TRITONCORE_CMAKE_DIR})

if(NOT TARGET TritonCore::triton-core-serverapi)
  include("${TRITONCORE_CMAKE_DIR}/TritonCoreTargets.cmake")
endif()
3rdparty/core-r22.12/include/triton/core/tritonbackend.h  (new file, 0 → 100644)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "triton/core/tritonserver.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONBACKEND
#if defined(_MSC_VER)
#define TRITONBACKEND_DECLSPEC __declspec(dllexport)
#define TRITONBACKEND_ISPEC __declspec(dllimport)
#elif defined(__GNUC__)
#define TRITONBACKEND_DECLSPEC __attribute__((__visibility__("default")))
#define TRITONBACKEND_ISPEC
#else
#define TRITONBACKEND_DECLSPEC
#define TRITONBACKEND_ISPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONBACKEND_DECLSPEC __declspec(dllimport)
#define TRITONBACKEND_ISPEC __declspec(dllexport)
#else
#define TRITONBACKEND_DECLSPEC
#define TRITONBACKEND_ISPEC
#endif
#endif
struct TRITONBACKEND_MemoryManager;
struct TRITONBACKEND_Input;
struct TRITONBACKEND_Output;
struct TRITONBACKEND_State;
struct TRITONBACKEND_Request;
struct TRITONBACKEND_ResponseFactory;
struct TRITONBACKEND_Response;
struct TRITONBACKEND_Backend;
struct TRITONBACKEND_Model;
struct TRITONBACKEND_ModelInstance;
struct TRITONBACKEND_BackendAttribute;

///
/// TRITONBACKEND API Version
///
/// The TRITONBACKEND API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// backend should check that the API version used to compile the
/// backend is compatible with the API version of the Triton server
/// that it is running in. This is typically done by code similar to
/// the following which makes sure that the major versions are equal
/// and that the minor version of Triton is >= the minor version used
/// to build the backend.
///
///   uint32_t api_version_major, api_version_minor;
///   TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor);
///   if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
///       (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
///     return TRITONSERVER_ErrorNew(
///         TRITONSERVER_ERROR_UNSUPPORTED,
///         "triton backend API version does not support this backend");
///   }
///
#define TRITONBACKEND_API_VERSION_MAJOR 1
#define TRITONBACKEND_API_VERSION_MINOR 10
/// Get the TRITONBACKEND API version supported by Triton. This value
/// can be compared against the TRITONBACKEND_API_VERSION_MAJOR and
/// TRITONBACKEND_API_VERSION_MINOR used to build the backend to
/// ensure that Triton is compatible with the backend.
///
/// \param major Returns the TRITONBACKEND API major version supported
/// by Triton.
/// \param minor Returns the TRITONBACKEND API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ApiVersion(
    uint32_t* major, uint32_t* minor);

/// TRITONBACKEND_ArtifactType
///
/// The ways that the files that make up a backend or model are
/// communicated to the backend.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The model or backend
/// artifacts are made available to Triton via a locally
/// accessible filesystem. The backend can access these files
/// using an appropriate system API.
///
typedef enum TRITONBACKEND_artifacttype_enum {
  TRITONBACKEND_ARTIFACT_FILESYSTEM
} TRITONBACKEND_ArtifactType;

///
/// TRITONBACKEND_MemoryManager
///
/// Object representing a memory manager that is capable of
/// allocating and otherwise managing different memory types. For
/// improved performance Triton maintains pools for GPU and CPU-pinned
/// memory and the memory manager allows backends to access those
/// pools.
///
/// Allocate a contiguous block of memory of a specific type using a
/// memory manager. Two error codes have specific interpretations for
/// this function:
///
/// TRITONSERVER_ERROR_UNSUPPORTED: Indicates that Triton is
/// incapable of allocating the requested memory type and memory
/// type ID. Requests for the memory type and ID will always fail
/// no matter 'byte_size' of the request.
///
/// TRITONSERVER_ERROR_UNAVAILABLE: Indicates that Triton can
/// allocate the memory type and ID but that currently it cannot
/// allocate a contiguous block of memory of the requested
/// 'byte_size'.
///
/// \param manager The memory manager.
/// \param buffer Returns the allocated memory.
/// \param memory_type The type of memory to allocate.
/// \param memory_type_id The ID associated with the memory type to
/// allocate. For GPU memory this indicates the device ID of the GPU
/// to allocate from.
/// \param byte_size The size of memory to allocate, in bytes.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_MemoryManagerAllocate(
    TRITONBACKEND_MemoryManager* manager, void** buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
    const uint64_t byte_size);

/// Free a buffer that was previously allocated with
/// TRITONBACKEND_MemoryManagerAllocate. The call must provide the
/// same values for 'memory_type' and 'memory_type_id' as were used
/// when the buffer was allocated or else the behavior is undefined.
///
/// \param manager The memory manager.
/// \param buffer The allocated memory buffer to free.
/// \param memory_type The type of memory of the buffer.
/// \param memory_type_id The ID associated with the memory type of
/// the buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
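///
/// A minimal allocate/free sketch (illustrative only, not part of the API;
/// 'manager' is assumed to come from TRITONBACKEND_BackendMemoryManager and
/// error handling is abbreviated):
///
///   void* buffer = nullptr;
///   TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate(
///       manager, &buffer, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* id */,
///       1024 /* byte_size */);
///   if (err == nullptr) {
///     // ... use the pinned buffer ...
///     TRITONBACKEND_MemoryManagerFree(
///         manager, buffer, TRITONSERVER_MEMORY_CPU_PINNED, 0 /* id */);
///   }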
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_MemoryManagerFree(
    TRITONBACKEND_MemoryManager* manager, void* buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);

///
/// TRITONBACKEND_Input
///
/// Object representing an input tensor.
///
/// Get the name and properties of an input tensor. The returned
/// strings and other properties are owned by the input, not the
/// caller, and so should not be modified or freed.
///
/// \param input The input tensor.
/// \param name If non-nullptr, returns the tensor name.
/// \param datatype If non-nullptr, returns the tensor datatype.
/// \param shape If non-nullptr, returns the tensor shape.
/// \param dims_count If non-nullptr, returns the number of dimensions
/// in the tensor shape.
/// \param byte_size If non-nullptr, returns the size of the available
/// data for the tensor, in bytes. This size reflects the actual data
/// available, and does not necessarily match what is
/// expected/required for the tensor given its shape and datatype. It
/// is the responsibility of the backend to handle mismatches in these
/// sizes appropriately.
/// \param buffer_count If non-nullptr, returns the number of buffers
/// holding the contents of the tensor. These buffers are accessed
/// using TRITONBACKEND_InputBuffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputProperties(
    TRITONBACKEND_Input* input, const char** name,
    TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count);

/// Get the name and properties of an input tensor associated with a given
/// host policy. If there are no input buffers for the specified host policy,
/// the properties of the fallback input buffers are returned. The returned
/// strings and other properties are owned by the input, not the caller, and so
/// should not be modified or freed.
///
/// \param input The input tensor.
/// \param host_policy_name The host policy name. Fallback input properties
/// will be returned if nullptr is provided.
/// \param name If non-nullptr, returns the tensor name.
/// \param datatype If non-nullptr, returns the tensor datatype.
/// \param shape If non-nullptr, returns the tensor shape.
/// \param dims_count If non-nullptr, returns the number of dimensions
/// in the tensor shape.
/// \param byte_size If non-nullptr, returns the size of the available
/// data for the tensor, in bytes. This size reflects the actual data
/// available, and does not necessarily match what is
/// expected/required for the tensor given its shape and datatype. It
/// is the responsibility of the backend to handle mismatches in these
/// sizes appropriately.
/// \param buffer_count If non-nullptr, returns the number of buffers
/// holding the contents of the tensor. These buffers are accessed
/// using TRITONBACKEND_InputBufferForHostPolicy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count);

/// Get a buffer holding (part of) the tensor data for an input. For a
/// given input the number of buffers composing the input are found
/// from 'buffer_count' returned by TRITONBACKEND_InputProperties. The
/// returned buffer is owned by the input and so should not be
/// modified or freed by the caller. The lifetime of the buffer
/// matches that of the input and so the buffer should not be accessed
/// after the input tensor object is released.
///
/// \param input The input tensor.
/// \param index The index of the buffer. Must be 0 <= index <
/// buffer_count, where buffer_count is the value returned by
/// TRITONBACKEND_InputProperties.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_byte_size Returns the size, in bytes, of 'buffer'.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the function caller. Returns
/// the actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the function caller.
/// Returns the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
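///
/// A typical loop over all buffers of an input looks like the following
/// (illustrative sketch only; 'input' is assumed to come from
/// TRITONBACKEND_RequestInput and error handling is omitted):
///
///   uint32_t buffer_count = 0;
///   TRITONBACKEND_InputProperties(
///       input, nullptr, nullptr, nullptr, nullptr, nullptr, &buffer_count);
///   for (uint32_t b = 0; b < buffer_count; ++b) {
///     const void* buffer = nullptr;
///     uint64_t buffer_byte_size = 0;
///     TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
///     int64_t memory_type_id = 0;
///     TRITONBACKEND_InputBuffer(
///         input, b, &buffer, &buffer_byte_size, &memory_type,
///         &memory_type_id);
///     // ... consume 'buffer' ...
///   }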
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputBuffer(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id);

/// Get a buffer holding (part of) the tensor data for an input for a specific
/// host policy. If there are no input buffers specified for this host policy,
/// the fallback input buffer is returned.
/// For a given input the number of buffers composing the input are found
/// from 'buffer_count' returned by TRITONBACKEND_InputPropertiesForHostPolicy.
/// The returned buffer is owned by the input and so should not be modified or
/// freed by the caller. The lifetime of the buffer matches that of the input
/// and so the buffer should not be accessed after the input tensor object is
/// released.
///
/// \param input The input tensor.
/// \param host_policy_name The host policy name. Fallback input buffer
/// will be returned if nullptr is provided.
/// \param index The index of the buffer. Must be 0 <= index <
/// buffer_count, where buffer_count is the value returned by
/// TRITONBACKEND_InputPropertiesForHostPolicy.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_byte_size Returns the size, in bytes, of 'buffer'.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the function caller. Returns
/// the actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the function caller.
/// Returns the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

/// Get the buffer attributes associated with the given input buffer. For a
/// given input the number of buffers composing the input are found from
/// 'buffer_count' returned by TRITONBACKEND_InputProperties. The returned
/// 'buffer_attributes' is owned by the input and so should not be modified or
/// freed by the caller. The lifetime of the 'buffer_attributes' matches that of
/// the input and so the 'buffer_attributes' should not be accessed after the
/// input tensor object is released.
///
/// \param input The input tensor.
/// \param index The index of the buffer. Must be 0 <= index < buffer_count,
/// where buffer_count is the value returned by TRITONBACKEND_InputProperties.
/// \param buffer Returns a pointer to a contiguous block of data for
/// the named input.
/// \param buffer_attributes Returns the attributes for the given buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_InputBufferAttributes(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    TRITONSERVER_BufferAttributes** buffer_attributes);

///
/// TRITONBACKEND_Output
///
/// Object representing a response output tensor.
///
/// Get a buffer to use to hold the tensor data for the output. The
/// returned buffer is owned by the output and so should not be freed
/// by the caller. The caller can and should fill the buffer with the
/// output data for the tensor. The lifetime of the buffer matches
/// that of the output and so the buffer should not be accessed after
/// the output tensor object is released.
///
/// \param buffer Returns a pointer to a buffer where the contents of
/// the output tensor should be placed.
/// \param buffer_byte_size The size, in bytes, of the buffer required
/// by the caller.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the caller. Returns the
/// actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the caller. Returns
/// the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_OutputBuffer(
    TRITONBACKEND_Output* output, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id);

/// Get the buffer attributes associated with the given output buffer. The
/// returned 'buffer_attributes' is owned by the output and so should not be
/// modified or freed by the caller. The lifetime of the 'buffer_attributes'
/// matches that of the output and so the 'buffer_attributes' should not be
/// accessed after the output tensor object is released. This function must be
/// called after the TRITONBACKEND_OutputBuffer otherwise it might contain
/// incorrect data.
///
/// \param output The output tensor.
/// \param buffer_attributes Returns the attributes for the output buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
    TRITONBACKEND_Output* output,
    TRITONSERVER_BufferAttributes** buffer_attributes);

///
/// TRITONBACKEND_Request
///
/// Object representing an inference request.
///
/// Get the ID of the request. Can be nullptr if the request doesn't have
/// an ID. The returned string is owned by the request, not the
/// caller, and so should not be modified or freed.
///
/// \param request The inference request.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestId(
    TRITONBACKEND_Request* request, const char** id);

/// Get the correlation ID of the request if it is an unsigned integer.
/// Zero indicates that the request does not have a correlation ID.
/// Returns failure if correlation ID for given request is not an unsigned
/// integer.
///
/// \param request The inference request.
/// \param id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestCorrelationId(
    TRITONBACKEND_Request* request, uint64_t* id);

/// Get the correlation ID of the request if it is a string.
/// Empty string indicates that the request does not have a correlation ID.
/// Returns error if correlation ID for given request is not a string.
///
/// \param request The inference request.
/// \param id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
    TRITONBACKEND_Request* request, const char** id);

/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param request The inference request.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestFlags(
    TRITONBACKEND_Request* request, uint32_t* flags);

/// Get the number of input tensors specified in the request.
///
/// \param request The inference request.
/// \param count Returns the number of input tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputCount(
    TRITONBACKEND_Request* request, uint32_t* count);

/// Get the name of an input tensor. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'request'.
///
/// \param request The inference request.
/// \param index The index of the input tensor. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONBACKEND_RequestInputCount.
/// \param input_name Returns the name of the input tensor
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** input_name);

/// Get a named request input. The lifetime of the returned input
/// object matches that of the request and so the input object should
/// not be accessed after the request object is released.
///
/// \param request The inference request.
/// \param name The name of the input.
/// \param input Returns the input corresponding to the name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInput(
    TRITONBACKEND_Request* request, const char* name,
    TRITONBACKEND_Input** input);

/// Get a request input by index. The order of inputs in a given
/// request is not necessarily consistent with other requests, even if
/// the requests are in the same batch. As a result, you can not
/// assume that an index obtained from one request will point to the
/// same input in a different request.
///
/// The lifetime of the returned input object matches that of the
/// request and so the input object should not be accessed after the
/// request object is released.
///
/// \param request The inference request.
/// \param index The index of the input tensor. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONBACKEND_RequestInputCount.
/// \param input Returns the input corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
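///
/// For example, a backend can visit every input of a request with the
/// count/by-index pair (illustrative sketch only; error handling omitted):
///
///   uint32_t input_count = 0;
///   TRITONBACKEND_RequestInputCount(request, &input_count);
///   for (uint32_t i = 0; i < input_count; ++i) {
///     TRITONBACKEND_Input* input = nullptr;
///     TRITONBACKEND_RequestInputByIndex(request, i, &input);
///     // ... query the input with TRITONBACKEND_InputProperties ...
///   }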
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestInputByIndex(
    TRITONBACKEND_Request* request, const uint32_t index,
    TRITONBACKEND_Input** input);

/// Get the number of output tensors requested to be returned in the
/// request.
///
/// \param request The inference request.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestOutputCount(
    TRITONBACKEND_Request* request, uint32_t* count);

/// Get the name of a requested output tensor. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'request'.
///
/// \param request The inference request.
/// \param index The index of the requested output tensor. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_RequestOutputCount.
/// \param output_name Returns the name of the requested output tensor
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestOutputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** output_name);

/// Returns the preferred memory type and memory type ID of the output buffer
/// for the request. As much as possible, Triton will attempt to return
/// the same memory_type and memory_type_id values that will be returned by
/// the subsequent call to TRITONBACKEND_OutputBuffer, however, the backend must
/// be capable of handling cases where the values differ.
///
/// \param request The request.
/// \param name The name of the output tensor. This is optional
/// and it should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and it should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns memory type preferred
/// by Triton, taking into account the caller's preferred type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns memory type ID preferred
/// by Triton, taking into account the caller's preferred type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
/// A TRITONSERVER_ERROR_UNAVAILABLE error indicates that the properties are not
/// available, other error codes indicate an error.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
    TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

/// Release the request. The request should be released when it is no
/// longer needed by the backend. If this call returns with an error
/// (i.e. non-nullptr) then the request was not released and ownership
/// remains with the backend. If this call returns with success, the
/// 'request' object is no longer owned by the backend and must not be
/// used. Any tensor names, data types, shapes, input tensors,
/// etc. returned by TRITONBACKEND_Request* functions for this request
/// are no longer valid. If a persistent copy of that data is required
/// it must be created before calling this function.
///
/// \param request The inference request.
/// \param release_flags Flags indicating what type of request release
/// should be performed. \see TRITONSERVER_RequestReleaseFlag. \see
/// TRITONSERVER_InferenceRequestReleaseFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_RequestRelease(
    TRITONBACKEND_Request* request, uint32_t release_flags);

///
/// TRITONBACKEND_ResponseFactory
///
/// Object representing an inference response factory. Using a
/// response factory is not required; instead a response can be
/// generated directly from a TRITONBACKEND_Request object using
/// TRITONBACKEND_ResponseNew(). A response factory allows a request
/// to be released before all responses have been sent. Releasing a
/// request as early as possible releases all input tensor data and
/// therefore may be desirable in some cases.
/// Create the response factory associated with a request.
///
/// \param factory Returns the new response factory.
/// \param request The inference request.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseFactoryNew(
    TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request);

/// Destroy a response factory.
///
/// \param factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseFactoryDelete(
    TRITONBACKEND_ResponseFactory* factory);

/// Send response flags without a corresponding response.
///
/// \param factory The response factory.
/// \param send_flags Flags to send. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
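///
/// A decoupled-style sketch (illustrative only; the flag constants are
/// defined in tritonserver.h and error handling is omitted): create the
/// factory, release the request early, then send responses from the factory
/// and finish with a FINAL flag.
///
///   TRITONBACKEND_ResponseFactory* factory = nullptr;
///   TRITONBACKEND_ResponseFactoryNew(&factory, request);
///   TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL);
///   // ... later, possibly from another thread ...
///   TRITONBACKEND_Response* response = nullptr;
///   TRITONBACKEND_ResponseNewFromFactory(&response, factory);
///   // ... add outputs, then send ...
///   TRITONBACKEND_ResponseSend(response, 0 /* send_flags */, nullptr);
///   TRITONBACKEND_ResponseFactorySendFlags(
///       factory, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
///   TRITONBACKEND_ResponseFactoryDelete(factory);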
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
    TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags);

///
/// TRITONBACKEND_Response
///
/// Object representing an inference response. For a given request,
/// the backend must carefully manage the lifecycle of responses
/// generated for that request to ensure that the output tensor
/// buffers are allocated correctly. When a response is created with
/// TRITONBACKEND_ResponseNew or TRITONBACKEND_ResponseNewFromFactory,
/// all the outputs and corresponding buffers must be created for that
/// response using TRITONBACKEND_ResponseOutput and
/// TRITONBACKEND_OutputBuffer *before* another response is created
/// for the request. For a given response, outputs can be created in
/// any order but they must be created sequentially/synchronously (for
/// example, the backend cannot use multiple threads to simultaneously
/// add multiple outputs to a response).
///
/// The above requirement applies only to responses being generated
/// for a given request. The backend may generate responses in
/// parallel on multiple threads as long as those responses are for
/// different requests.
///
/// This order of response creation must be strictly followed. But,
/// once response(s) are created they do not need to be sent
/// immediately, nor do they need to be sent in the order they were
/// created. The backend may even delete a created response instead of
/// sending it by using TRITONBACKEND_ResponseDelete.
/// Create a response for a request.
///
/// \param response Returns the new response.
/// \param request The request.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseNew(
    TRITONBACKEND_Response** response, TRITONBACKEND_Request* request);

/// Create a response using a factory.
///
/// \param response Returns the new response.
/// \param factory The response factory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
    TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory);

/// Destroy a response. It is not necessary to delete a response if
/// TRITONBACKEND_ResponseSend is called as that function transfers
/// ownership of the response object to Triton.
///
/// \param response The response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseDelete(
    TRITONBACKEND_Response* response);

/// Set a string parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
    TRITONBACKEND_Response* response, const char* name, const char* value);

/// Set an integer parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
    TRITONBACKEND_Response* response, const char* name, const int64_t value);

/// Set a boolean parameter in the response.
///
/// \param response The response.
/// \param name The name of the parameter.
/// \param value The value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
    TRITONBACKEND_Response* response, const char* name, const bool value);

/// Create an output tensor in the response. The lifetime of the
/// returned output tensor object matches that of the response and so
/// the output tensor object should not be accessed after the response
/// object is deleted.
///
/// \param response The response.
/// \param output Returns the new response output.
/// \param name The name of the output tensor.
/// \param datatype The datatype of the output tensor.
/// \param shape The shape of the output tensor.
/// \param dims_count The number of dimensions in the output tensor
/// shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseOutput(
    TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count);

/// Send a response. Calling this function transfers ownership of the
/// response object to Triton. The caller must not access or delete
/// the response object after calling this function.
///
/// \param response The response.
/// \param send_flags Flags associated with the response. \see
/// TRITONSERVER_ResponseCompleteFlag. \see
/// TRITONSERVER_InferenceResponseCompleteFn_t.
/// \param error The TRITONSERVER_Error to send if the response is an
/// error, or nullptr if the response is successful.
/// \return a TRITONSERVER_Error indicating success or failure.
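///
/// A typical create/fill/send sequence looks like the following (illustrative
/// sketch only; the "OUTPUT0" name, shape, datatype and size are placeholders
/// and error handling is omitted):
///
///   TRITONBACKEND_Response* response = nullptr;
///   TRITONBACKEND_ResponseNew(&response, request);
///   TRITONBACKEND_Output* output = nullptr;
///   const int64_t shape[1] = {8};
///   TRITONBACKEND_ResponseOutput(
///       response, &output, "OUTPUT0", TRITONSERVER_TYPE_FP32, shape,
///       1 /* dims_count */);
///   void* buffer = nullptr;
///   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
///   int64_t memory_type_id = 0;
///   TRITONBACKEND_OutputBuffer(
///       output, &buffer, 8 * sizeof(float), &memory_type, &memory_type_id);
///   // ... fill 'buffer' with the output data ...
///   TRITONBACKEND_ResponseSend(
///       response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* error */);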
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ResponseSend(
    TRITONBACKEND_Response* response, const uint32_t send_flags,
    TRITONSERVER_Error* error);

///
/// TRITONBACKEND_State
///
/// Object representing a state.
///
/// Create a state in the request. The returned state object is only valid
/// before the TRITONBACKEND_StateUpdate is called. The state should not be
/// freed by the caller. If TRITONBACKEND_StateUpdate is not called, the
/// lifetime of the state matches the lifetime of the request. If the state name
/// does not exist in the "state" section of the model configuration, the state
/// will not be created and an error will be returned. If this function is
/// called when sequence batching is not enabled or there is no 'states' section
/// in the sequence batching section of the model configuration, this call will
/// return an error.
///
/// \param state Returns the new state.
/// \param request The request.
/// \param name The name of the state.
/// \param datatype The datatype of the state.
/// \param shape The shape of the state.
/// \param dims_count The number of dimensions in the state shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateNew(
    TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count);

/// Update the state for the sequence. Calling this function will replace the
/// state stored for this sequence in Triton with 'state' provided in the
/// function argument. If this function is called when sequence batching is not
/// enabled or there is no 'states' section in the sequence batching section of
/// the model configuration, this call will return an error. The backend is not
/// required to call this function. If the backend doesn't call
/// TRITONBACKEND_StateUpdate function, this particular state for the sequence
/// will not be updated and the next inference request in the sequence will use
/// the same state as the current inference request.
///
/// \param state The state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateUpdate(
    TRITONBACKEND_State* state);

/// Get a buffer to use to hold the tensor data for the state. The returned
/// buffer is owned by the state and so should not be freed by the caller. The
/// caller can and should fill the buffer with the state data. The buffer must
/// not be accessed by the backend after TRITONBACKEND_StateUpdate is called.
/// The caller should fill the buffer before calling TRITONBACKEND_StateUpdate.
///
/// \param state The state.
/// \param buffer Returns a pointer to a buffer where the contents of the state
/// should be placed.
/// \param buffer_byte_size The size, in bytes, of the buffer required
/// by the caller.
/// \param memory_type Acts as both input and output. On input gives
/// the buffer memory type preferred by the caller. Returns the
/// actual memory type of 'buffer'.
/// \param memory_type_id Acts as both input and output. On input
/// gives the buffer memory type id preferred by the caller. Returns
/// the actual memory type id of 'buffer'.
/// \return a TRITONSERVER_Error indicating success or failure.
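///
/// A minimal sequence-state sketch (illustrative only; it assumes a state
/// named "sequence_state" is declared in the model's sequence_batching state
/// configuration, and error handling is omitted):
///
///   TRITONBACKEND_State* state = nullptr;
///   const int64_t shape[1] = {1};
///   TRITONBACKEND_StateNew(
///       &state, request, "sequence_state", TRITONSERVER_TYPE_INT32, shape,
///       1 /* dims_count */);
///   void* buffer = nullptr;
///   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
///   int64_t memory_type_id = 0;
///   TRITONBACKEND_StateBuffer(
///       state, &buffer, sizeof(int32_t), &memory_type, &memory_type_id);
///   // ... write the new state value into 'buffer' ...
///   TRITONBACKEND_StateUpdate(state);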
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateBuffer(
    TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);

/// Get the buffer attributes associated with the given state buffer.
/// The returned 'buffer_attributes' is owned by the state and so should not be
/// modified or freed by the caller. The lifetime of the 'buffer_attributes'
/// matches that of the state.
///
/// \param state The state.
/// \param buffer_attributes Returns the buffer attributes for the given state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_StateBufferAttributes(
    TRITONBACKEND_State* state,
    TRITONSERVER_BufferAttributes** buffer_attributes);

///
/// TRITONBACKEND_Backend
///
/// Object representing a backend.
///
/// TRITONBACKEND_ExecutionPolicy
///
/// Types of execution policy that can be implemented by a backend.
///
/// TRITONBACKEND_EXECUTION_BLOCKING: An instance of the model
/// blocks in TRITONBACKEND_ModelInstanceExecute until it is ready
/// to handle another inference. Upon returning from
/// TRITONBACKEND_ModelInstanceExecute, Triton may immediately
/// call TRITONBACKEND_ModelInstanceExecute for the same instance
/// to execute a new batch of requests. Thus, most backends using
/// this policy will not return from
/// TRITONBACKEND_ModelInstanceExecute until all responses have
/// been sent and all requests have been released. This is the
/// default execution policy.
///
/// TRITONBACKEND_EXECUTION_DEVICE_BLOCKING: An instance, A, of the
/// model blocks in TRITONBACKEND_ModelInstanceExecute if the
/// device associated with the instance is unable to handle
/// another inference. Even if another instance, B, associated
/// with the device, is available and ready to perform an
/// inference, Triton will not invoke
/// TRITONBACKEND_ModelInstanceExecute for B until A returns from
/// TRITONBACKEND_ModelInstanceExecute. Triton will not be blocked
/// from calling TRITONBACKEND_ModelInstanceExecute for instance
/// C, which is associated with a different device than A and B,
/// even if A or B has not returned from
/// TRITONBACKEND_ModelInstanceExecute. This execution policy is
/// typically used by a backend that can cooperatively execute
/// multiple model instances on the same device.
///
typedef enum TRITONBACKEND_execpolicy_enum {
  TRITONBACKEND_EXECUTION_BLOCKING,
  TRITONBACKEND_EXECUTION_DEVICE_BLOCKING
} TRITONBACKEND_ExecutionPolicy;

/// Get the name of the backend. The caller does not own the returned
/// string and must not modify or delete it. The lifetime of the
/// returned string extends only as long as 'backend'.
///
/// \param backend The backend.
/// \param name Returns the name of the backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendName(
    TRITONBACKEND_Backend* backend, const char** name);

/// Get the backend configuration. The 'backend_config' message is
/// owned by Triton and should not be modified or freed by the caller.
///
/// The backend configuration, as JSON, is:
///
///   {
///     "cmdline" : {
///       "<setting>" : "<value>",
///       ...
///     }
///   }
///
/// \param backend The backend.
/// \param backend_config Returns the backend configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendConfig(
    TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config);

/// Get the execution policy for this backend. By default the
/// execution policy is TRITONBACKEND_EXECUTION_BLOCKING.
///
/// \param backend The backend.
/// \param policy Returns the execution policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy);

/// Set the execution policy for this backend. By default the
/// execution policy is TRITONBACKEND_EXECUTION_BLOCKING. Triton reads
/// the backend's execution policy after calling
/// TRITONBACKEND_Initialize, so to be recognized changes to the
/// execution policy must be made in TRITONBACKEND_Initialize.
/// Also, note that if the sequence batcher is used for the model, Triton
/// will use the TRITONBACKEND_EXECUTION_BLOCKING policy irrespective of
/// the policy specified by this setter function.
///
/// \param backend The backend.
/// \param policy The execution policy.
/// \return a TRITONSERVER_Error indicating success or failure.
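///
/// For example, a backend that wants device-blocking execution would set the
/// policy during initialization (illustrative sketch only;
/// TRITONBACKEND_Initialize is the backend entry point declared later in this
/// header):
///
///   TRITONSERVER_Error*
///   TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
///   {
///     // ... version check, backend config, etc. ...
///     return TRITONBACKEND_BackendSetExecutionPolicy(
///         backend, TRITONBACKEND_EXECUTION_DEVICE_BLOCKING);
///   }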
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy);

/// Get the location of the files that make up the backend
/// implementation. This location contains the backend shared library
/// and any other files located with the shared library. The
/// 'location' communicated depends on how the backend is being
/// communicated to Triton as indicated by 'artifact_type'.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The backend artifacts are
/// made available to Triton via the local filesystem. 'location'
/// returns the full path to the directory containing this
/// backend's artifacts. The returned string is owned by Triton,
/// not the caller, and so should not be modified or freed.
///
/// \param backend The backend.
/// \param artifact_type Returns the artifact type for the backend.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendArtifacts(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location);

/// Get the memory manager associated with a backend.
///
/// \param backend The backend.
/// \param manager Returns the memory manager.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendMemoryManager(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager);

/// Get the user-specified state associated with the backend. The
/// state is completely owned and managed by the backend.
///
/// \param backend The backend.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendState(
    TRITONBACKEND_Backend* backend, void** state);

/// Set the user-specified state associated with the backend. The
/// state is completely owned and managed by the backend.
///
/// \param backend The backend.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_BackendSetState(
    TRITONBACKEND_Backend* backend, void* state);

///
/// TRITONBACKEND_Model
///
/// Object representing a model implemented using the backend.
///
/// Get the name of the model. The returned string is owned by the
/// model object, not the caller, and so should not be modified or
/// freed.
///
/// \param model The model.
/// \param name Returns the model name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelName(
    TRITONBACKEND_Model* model, const char** name);

/// Get the version of the model.
///
/// \param model The model.
/// \param version Returns the model version.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelVersion(
    TRITONBACKEND_Model* model, uint64_t* version);

/// Get the location of the files that make up the model. The
/// 'location' communicated depends on how the model is being
/// communicated to Triton as indicated by 'artifact_type'.
///
/// TRITONBACKEND_ARTIFACT_FILESYSTEM: The model artifacts are made
/// available to Triton via the local filesystem. 'location'
/// returns the full path to the directory in the model repository
/// that contains this model's artifacts. The returned string is
/// owned by Triton, not the caller, and so should not be modified
/// or freed.
///
/// \param model The model.
/// \param artifact_type Returns the artifact type for the model.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelRepository(
    TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location);

/// Get the model configuration. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object. The configuration is available via this call even
/// before the model is loaded and so can be used in
/// TRITONBACKEND_ModelInitialize. TRITONSERVER_ServerModelConfig
/// returns equivalent information but is not useable until after the
/// model loads.
///
/// \param model The model.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
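///
/// For example, the configuration can be inspected as JSON (illustrative
/// sketch only; TRITONSERVER_MessageSerializeToJson and
/// TRITONSERVER_MessageDelete are declared in tritonserver.h, and error
/// handling is omitted):
///
///   TRITONSERVER_Message* config = nullptr;
///   TRITONBACKEND_ModelConfig(model, 1 /* config_version */, &config);
///   const char* json_base = nullptr;
///   size_t json_size = 0;
///   TRITONSERVER_MessageSerializeToJson(config, &json_base, &json_size);
///   // ... parse the JSON in [json_base, json_base + json_size) ...
///   TRITONSERVER_MessageDelete(config);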
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message** model_config);

/// Whether the backend should attempt to auto-complete the model configuration.
/// If true, the model should fill the inputs, outputs, and max batch size in
/// the model configuration if incomplete. If the model configuration is
/// changed, the new configuration must be reported to Triton using
/// TRITONBACKEND_ModelSetConfig.
///
/// \param model The model.
/// \param auto_complete_config Returns whether the backend should auto-complete
/// the model configuration.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
    TRITONBACKEND_Model* model, bool* auto_complete_config);

/// Set the model configuration in Triton server. This API should only be called
/// when the backend implements the auto-completion of model configuration
/// and TRITONBACKEND_ModelAutoCompleteConfig returns true in
/// auto_complete_config. Only the inputs, outputs, max batch size, and
/// scheduling choice can be changed. The caveat is that the scheduling choice
/// can only be changed if none was previously set. Any other changes to the model
/// configuration will be ignored by Triton. This function can only be called
/// from TRITONBACKEND_ModelInitialize, calling in any other context will result
/// in an error being returned. Additionally, Triton server can add some of the
/// missing fields in the provided config with this call. The backend must get
/// the complete configuration again by using TRITONBACKEND_ModelConfig.
/// TRITONBACKEND_ModelSetConfig does not take ownership of the message object
/// and so the caller should call TRITONSERVER_MessageDelete to release the
/// object once the function returns.
///
/// \param model The model.
/// \param config_version The format version of the model configuration.
/// If the configuration is not represented in the version's format
/// then an error will be returned. Currently only version 1 is supported.
/// \param model_config The updated model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
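///
/// A typical auto-complete flow in TRITONBACKEND_ModelInitialize looks like
/// the following (illustrative sketch only; 'BuildCompletedConfig' is a
/// hypothetical backend helper that produces the updated configuration
/// message):
///
///   bool auto_complete = false;
///   TRITONBACKEND_ModelAutoCompleteConfig(model, &auto_complete);
///   if (auto_complete) {
///     // hypothetical helper, not part of the Triton API
///     TRITONSERVER_Message* updated_config = BuildCompletedConfig(model);
///     TRITONBACKEND_ModelSetConfig(
///         model, 1 /* config_version */, updated_config);
///     TRITONSERVER_MessageDelete(updated_config);
///   }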
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelSetConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message* model_config);

/// Get the TRITONSERVER_Server object that this model is being served
/// by.
///
/// \param model The model.
/// \param server Returns the server.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelServer(
    TRITONBACKEND_Model* model, TRITONSERVER_Server** server);

/// Get the backend used by the model.
///
/// \param model The model.
/// \param backend Returns the backend object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelBackend(
    TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend);

/// Get the user-specified state associated with the model. The
/// state is completely owned and managed by the backend.
///
/// \param model The model.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelState(
    TRITONBACKEND_Model* model, void** state);

/// Set the user-specified state associated with the model. The
/// state is completely owned and managed by the backend.
///
/// \param model The model.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
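///
/// For example, a backend usually attaches its per-model object in
/// TRITONBACKEND_ModelInitialize and retrieves it again in
/// TRITONBACKEND_ModelFinalize (illustrative sketch only; 'ModelState' is a
/// hypothetical backend-defined type):
///
///   // in TRITONBACKEND_ModelInitialize
///   ModelState* model_state = new ModelState(model);  // hypothetical type
///   TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state));
///
///   // later, in TRITONBACKEND_ModelFinalize
///   void* vstate = nullptr;
///   TRITONBACKEND_ModelState(model, &vstate);
///   delete reinterpret_cast<ModelState*>(vstate);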
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelSetState(
    TRITONBACKEND_Model* model, void* state);

///
/// TRITONBACKEND_ModelInstance
///
/// Object representing a model instance implemented using the
/// backend.
///
/// Get the name of the model instance. The returned string is owned by the
/// model object, not the caller, and so should not be modified or
/// freed.
///
/// \param instance The model instance.
/// \param name Returns the instance name.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceName(
    TRITONBACKEND_ModelInstance* instance, const char** name);

);
/// Get the kind of the model instance.
///
/// \param instance The model instance.
/// \param kind Returns the instance kind.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceKind(
    TRITONBACKEND_ModelInstance* instance,
    TRITONSERVER_InstanceGroupKind* kind);

/// Get the device ID of the model instance.
///
/// \param instance The model instance.
/// \param device_id Returns the instance device ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceDeviceId(
    TRITONBACKEND_ModelInstance* instance, int32_t* device_id);

/// Get the host policy setting. The 'host_policy' message is
/// owned by Triton and should not be modified or freed by the caller.
///
/// The host policy setting, as JSON, is:
///
///   {
///     "<host_policy>" : {
///       "<setting>" : "<value>",
///       ...
///     }
///   }
///
/// \param instance The model instance.
/// \param host_policy Returns the host policy setting as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
    TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy);

/// Whether the model instance is passive.
///
/// \param instance The model instance.
/// \param is_passive Returns true if the instance is passive, false otherwise
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
    TRITONBACKEND_ModelInstance* instance, bool* is_passive);

/// Get the number of optimization profiles to be loaded for the instance.
///
/// \param instance The model instance.
/// \param count Returns the number of optimization profiles.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count);

/// Get the name of optimization profile. The caller does not own
/// the returned string and must not modify or delete it. The lifetime
/// of the returned string extends only as long as 'instance'.
///
/// \param instance The model instance.
/// \param index The index of the optimization profile. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_ModelInstanceProfileCount.
/// \param profile_name Returns the name of the optimization profile
/// corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
    TRITONBACKEND_ModelInstance* instance, const uint32_t index,
    const char** profile_name);

/// Get the number of secondary devices configured for the instance.
///
/// \param instance The model instance.
/// \param count Returns the number of secondary devices.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count);
/// Get the properties of indexed secondary device. The returned
/// strings and other properties are owned by the instance, not the
/// caller, and so should not be modified or freed.
///
/// \param instance The model instance.
/// \param index The index of the secondary device. Must be 0
/// <= index < count, where count is the value returned by
/// TRITONBACKEND_ModelInstanceSecondaryDeviceCount.
/// \param kind Returns the kind of secondary device corresponding
/// to the index.
/// \param id Returns the id of secondary device corresponding to the index.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
    TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
    int64_t* id);
/// Get the model associated with a model instance.
///
/// \param instance The model instance.
/// \param model Returns the model object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceModel(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model);
/// Get the user-specified state associated with the model
/// instance. The state is completely owned and managed by the
/// backend.
///
/// \param instance The model instance.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceState(
    TRITONBACKEND_ModelInstance* instance, void** state);
/// Set the user-specified state associated with the model
/// instance. The state is completely owned and managed by the
/// backend.
///
/// \param instance The model instance.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceSetState(
    TRITONBACKEND_ModelInstance* instance, void* state);
/// Record statistics for an inference request.
///
/// Set 'success' true to indicate that the inference request
/// completed successfully. In this case all timestamps should be
/// non-zero values reported in nanoseconds and should be collected
/// using std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
/// Set 'success' to false to indicate that the inference request failed
/// to complete successfully. In this case all timestamps values are
/// ignored.
///
/// For consistency of measurement across different backends, the
/// timestamps should be collected at the following points during
/// TRITONBACKEND_ModelInstanceExecute.
///
///   TRITONBACKEND_ModelInstanceExecute()
///     CAPTURE TIMESTAMP (exec_start_ns)
///     < process input tensors to prepare them for inference
///       execution, including copying the tensors to/from GPU if
///       necessary >
///     CAPTURE TIMESTAMP (compute_start_ns)
///     < perform inference computations to produce outputs >
///     CAPTURE TIMESTAMP (compute_end_ns)
///     < allocate output buffers and extract output tensors, including
///       copying the tensors to/from GPU if necessary >
///     CAPTURE TIMESTAMP (exec_end_ns)
///     return
///
/// Note that these statistics are associated with a valid
/// TRITONBACKEND_Request object and so must be reported before the
/// request is released. For backends that release the request before
/// all response(s) are sent, these statistics cannot capture
/// information about the time required to produce the response.
///
/// \param instance The model instance.
/// \param request The inference request that statistics are being
/// reported for.
/// \param success True if the inference request completed
/// successfully, false if it failed to complete.
/// \param exec_start_ns Timestamp for the start of execution.
/// \param compute_start_ns Timestamp for the start of execution
/// computations.
/// \param compute_end_ns Timestamp for the end of execution
/// computations.
/// \param exec_end_ns Timestamp for the end of execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
    const bool success, const uint64_t exec_start_ns,
    const uint64_t compute_start_ns, const uint64_t compute_end_ns,
    const uint64_t exec_end_ns);
/// Record statistics for the execution of an entire batch of
/// inference requests.
///
/// All timestamps should be non-zero values reported in nanoseconds
/// and should be collected using
/// std::chrono::steady_clock::now().time_since_epoch() or the equivalent.
/// See TRITONBACKEND_ModelInstanceReportStatistics for more information about
/// the timestamps.
///
/// 'batch_size' is the sum of the batch sizes for the individual
/// requests that were delivered together in the call to
/// TRITONBACKEND_ModelInstanceExecute. For example, if three requests
/// are passed to TRITONBACKEND_ModelInstanceExecute and those
/// requests have batch size 1, 2, and 3; then 'batch_size' should be
/// set to 6.
///
/// \param instance The model instance.
/// \param batch_size Combined batch size of all the individual
/// requests executed in the batch.
/// \param exec_start_ns Timestamp for the start of execution.
/// \param compute_start_ns Timestamp for the start of execution
/// computations.
/// \param compute_end_ns Timestamp for the end of execution
/// computations.
/// \param exec_end_ns Timestamp for the end of execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
    TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
    const uint64_t exec_start_ns, const uint64_t compute_start_ns,
    const uint64_t compute_end_ns, const uint64_t exec_end_ns);
///
/// The following functions can be implemented by a backend. Functions
/// indicated as required must be implemented or the backend will fail
/// to load.
///
/// Initialize a backend. This function is optional, a backend is not
/// required to implement it. This function is called once when a
/// backend is loaded to allow the backend to initialize any state
/// associated with the backend. A backend has a single state that is
/// shared across all models that use the backend.
///
/// \param backend The backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Initialize(
    TRITONBACKEND_Backend* backend);
/// Finalize for a backend. This function is optional, a backend is
/// not required to implement it. This function is called once, just
/// before the backend is unloaded. All state associated with the
/// backend should be freed and any threads created for the backend
/// should be exited/joined before returning from this function.
///
/// \param backend The backend.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_Finalize(
    TRITONBACKEND_Backend* backend);
/// Initialize for a model. This function is optional, a backend is
/// not required to implement it. This function is called once when a
/// model that uses the backend is loaded to allow the backend to
/// initialize any state associated with the model. The backend should
/// also examine the model configuration to determine if the
/// configuration is suitable for the backend. Any errors reported by
/// this function will prevent the model from loading.
///
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(
    TRITONBACKEND_Model* model);
/// Finalize for a model. This function is optional, a backend is not
/// required to implement it. This function is called once for a
/// model, just before the model is unloaded from Triton. All state
/// associated with the model should be freed and any threads created
/// for the model should be exited/joined before returning from this
/// function.
///
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(
    TRITONBACKEND_Model* model);
/// Initialize for a model instance. This function is optional, a
/// backend is not required to implement it. This function is called
/// once when a model instance is created to allow the backend to
/// initialize any state associated with the instance.
///
/// \param instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(
    TRITONBACKEND_ModelInstance* instance);
/// Finalize for a model instance. This function is optional, a
/// backend is not required to implement it. This function is called
/// once for an instance, just before the corresponding model is
/// unloaded from Triton. All state associated with the instance
/// should be freed and any threads created for the instance should be
/// exited/joined before returning from this function.
///
/// \param instance The model instance.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(
    TRITONBACKEND_ModelInstance* instance);
/// Execute a batch of one or more requests on a model instance. This
/// function is required. Triton will not perform multiple
/// simultaneous calls to this function for a given model 'instance';
/// however, there may be simultaneous calls for different model
/// instances (for the same or different models).
///
/// If an error is returned the ownership of the request objects
/// remains with Triton and the backend must not retain references to
/// the request objects or access them in any way.
///
/// If success is returned, ownership of the request objects is
/// transferred to the backend and it is then responsible for creating
/// responses and releasing the request objects. Note that even though
/// ownership of the request objects is transferred to the backend, the
/// ownership of the buffer holding request pointers is returned back
/// to Triton upon return from TRITONBACKEND_ModelInstanceExecute. If
/// any request objects need to be maintained beyond
/// TRITONBACKEND_ModelInstanceExecute, then the pointers must be copied
/// out of the array within TRITONBACKEND_ModelInstanceExecute.
///
/// \param instance The model instance.
/// \param requests The requests.
/// \param request_count The number of requests in the batch.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count);
/// Query the backend for different model attributes. This function is
/// optional; a backend is not required to implement it, nor is it required
/// to set all of the attributes listed. This function is called when Triton
/// requires further backend / model information to perform operations.
/// This function may be called multiple times within the lifetime of the
/// backend (between TRITONBACKEND_Initialize and TRITONBACKEND_Finalize).
/// The backend may return an error to indicate that it failed to set the
/// backend attributes, in which case the attributes specified in that call
/// are ignored. Triton will apply the specified attributes only if 'nullptr'
/// (success) is returned.
///
/// \param backend The backend.
/// \param backend_attributes Return the backend attribute.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONBACKEND_ISPEC TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute(
    TRITONBACKEND_Backend* backend,
    TRITONBACKEND_BackendAttribute* backend_attributes);
/// TRITONBACKEND_BackendAttribute
///
/// API to modify attributes associated with a backend.
///
/// Add a preferred instance group for the backend. This function can be
/// called multiple times to cover the different instance group kinds that
/// the backend supports; the first call describes the most preferred group.
/// When instance groups are not explicitly provided in the model
/// configuration, Triton uses this attribute to create a model deployment
/// that better matches the backend's preference.
///
/// \param backend_attributes The backend attributes object.
/// \param kind The kind of the instance group.
/// \param count The number of instances per device. Triton default will be used
/// if 0 is provided.
/// \param device_ids The devices where instances should be available. Triton
/// default will be used if 'nullptr' is provided.
/// \param id_count The number of devices in 'device_ids'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
    TRITONBACKEND_BackendAttribute* backend_attributes,
    const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
    const uint64_t* device_ids, const uint64_t id_count);
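// Illustration (not part of the upstream header): a backend that prefers GPU
// instances could register that preference from
// TRITONBACKEND_GetBackendAttribute; a sketch, letting Triton defaults apply
// for the instance count and device IDs.
//
//   TRITONSERVER_Error* TRITONBACKEND_GetBackendAttribute(
//       TRITONBACKEND_Backend* backend,
//       TRITONBACKEND_BackendAttribute* backend_attributes)
//   {
//     // Most preferred kind first; 0 / nullptr request the Triton defaults.
//     return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
//         backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU,
//         0 /* count */, nullptr /* device_ids */, 0 /* id_count */);
//   }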
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/include/triton/core/tritonrepoagent.h
0 → 100644
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include "triton/core/tritonserver.h"
#ifdef __cplusplus
extern
"C"
{
#endif
#ifdef _COMPILING_TRITONREPOAGENT
#if defined(_MSC_VER)
#define TRITONREPOAGENT_DECLSPEC __declspec(dllexport)
#define TRITONREPOAGENT_ISPEC __declspec(dllimport)
#elif defined(__GNUC__)
#define TRITONREPOAGENT_DECLSPEC __attribute__((__visibility__("default")))
#define TRITONREPOAGENT_ISPEC
#else
#define TRITONREPOAGENT_DECLSPEC
#define TRITONREPOAGENT_ISPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONREPOAGENT_DECLSPEC __declspec(dllimport)
#define TRITONREPOAGENT_ISPEC __declspec(dllexport)
#else
#define TRITONREPOAGENT_DECLSPEC
#define TRITONREPOAGENT_ISPEC
#endif
#endif
struct TRITONREPOAGENT_Agent;
struct TRITONREPOAGENT_AgentModel;
///
/// TRITONREPOAGENT API Version
///
/// The TRITONREPOAGENT API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// repository agent should check that the API version used to compile
/// the agent is compatible with the API version of the Triton server
/// that it is running in. This is typically done by code similar to
/// the following which makes sure that the major versions are equal
/// and that the minor version of Triton is >= the minor version used
/// to build the agent.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONREPOAGENT_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONREPOAGENT_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONREPOAGENT_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton repository agent API version does not support this agent");
/// }
///
#define TRITONREPOAGENT_API_VERSION_MAJOR 0
#define TRITONREPOAGENT_API_VERSION_MINOR 1
/// Get the TRITONREPOAGENT API version supported by Triton. This
/// value can be compared against the
/// TRITONREPOAGENT_API_VERSION_MAJOR and
/// TRITONREPOAGENT_API_VERSION_MINOR used to build the agent to
/// ensure that Triton is compatible with the agent.
///
/// \param major Returns the TRITONREPOAGENT API major version supported
/// by Triton.
/// \param minor Returns the TRITONREPOAGENT API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ApiVersion(
    uint32_t* major, uint32_t* minor);
/// TRITONREPOAGENT_ArtifactType
///
/// The ways that the files that make up a model's repository content
/// are communicated between Triton and the agent.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
/// communicated to and from the repository agent via a locally
/// accessible filesystem. The agent can access these files using
/// an appropriate filesystem API.
///
/// TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
/// communicated to and from the repository agent via a remote filesystem.
/// The remote filesystem path follows the same convention as is used for
/// repository paths, for example, "s3://" prefix indicates an S3 path.
///
typedef enum TRITONREPOAGENT_artifacttype_enum {
  TRITONREPOAGENT_ARTIFACT_FILESYSTEM,
  TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM
} TRITONREPOAGENT_ArtifactType;
/// TRITONREPOAGENT_ActionType
///
/// Types of repository actions that can be handled by an agent.
/// The lifecycle of a TRITONREPOAGENT_AgentModel begins with a call to
/// TRITONREPOAGENT_ModelInitialize and ends with a call to
/// TRITONREPOAGENT_ModelFinalize. Between those calls the current lifecycle
/// state of the model is communicated by calls to TRITONREPOAGENT_ModelAction.
/// Possible lifecycles are:
///
/// LOAD -> LOAD_COMPLETE -> UNLOAD -> UNLOAD_COMPLETE
/// LOAD -> LOAD_FAIL
///
/// TRITONREPOAGENT_ACTION_LOAD: A model is being loaded.
///
/// TRITONREPOAGENT_ACTION_LOAD_COMPLETE: The model load completed
/// successfully and the model is now loaded.
///
/// TRITONREPOAGENT_ACTION_LOAD_FAIL: The model load did not complete
/// successfully. The model is not loaded.
///
/// TRITONREPOAGENT_ACTION_UNLOAD: The model is being unloaded.
///
/// TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE: The model unload is complete.
///
typedef
enum
TRITONREPOAGENT_actiontype_enum
{
TRITONREPOAGENT_ACTION_LOAD
,
TRITONREPOAGENT_ACTION_LOAD_COMPLETE
,
TRITONREPOAGENT_ACTION_LOAD_FAIL
,
TRITONREPOAGENT_ACTION_UNLOAD
,
TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE
}
TRITONREPOAGENT_ActionType
;
/// Get the location of the files that make up the model. The
/// 'location' communicated depends on how the model is being
/// communicated to the agent as indicated by 'artifact_type'.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
/// made available to the agent via the local
/// filesystem. 'location' returns the full path to the directory
/// in the model repository that contains the model's
/// artifacts. The returned location string is owned by Triton,
/// not the caller, and so should not be modified or freed. The
/// contents of the directory are owned by Triton, not the agent,
/// and so the agent should not delete or modify the contents. Use
/// TRITONREPOAGENT_RepositoryAcquire to get a location that can be
/// used to modify the model repository contents.
///
/// TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
/// made available to the agent via a remote filesystem.
/// 'location' returns the full path to the remote directory that contains
/// the model's artifacts. The returned location string is owned by Triton,
/// not the caller, and so should not be modified or freed. The contents of
/// the remote directory are owned by Triton, not the agent,
/// and so the agent should not delete or modify the contents.
/// Use TRITONREPOAGENT_ModelRepositoryLocationAcquire to get a location
/// that can be used to write updated model repository contents.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type Returns the artifact type for the location.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocation(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    TRITONREPOAGENT_ArtifactType* artifact_type, const char** location);
/// Acquire a location where the agent can produce a new version of
/// the model repository files. This is a convenience method to create
/// a temporary directory for the agent. The agent is responsible for
/// calling TRITONREPOAGENT_ModelRepositoryLocationRelease in
/// TRITONREPOAGENT_ModelFinalize to delete the location. Initially the
/// acquired location is empty. The 'location' communicated depends on
/// the requested 'artifact_type'.
///
/// TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The location is a directory
/// on the local filesystem. 'location' returns the full path to
/// an empty directory that the agent should populate with the
/// model's artifacts. The returned location string is owned by
/// Triton, not the agent, and so should not be modified or freed.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type The artifact type for the location.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationAcquire(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char** location);
/// Discard and release ownership of a previously acquired location
/// and its contents. The agent must not access or modify the location
/// or its contents after this call.
///
/// \param agent The agent.
/// \param model The model.
/// \param path The location to release.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationRelease(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const char* location);
/// Inform Triton that the specified repository location should be used for
/// the model in place of the original model repository. This method can only
/// be called when TRITONREPOAGENT_ModelAction is invoked with
/// TRITONREPOAGENT_ACTION_LOAD. The 'location' communicated depends on how
/// the repository is being communicated to Triton as indicated by
/// 'artifact_type'.
///
///   TRITONREPOAGENT_ARTIFACT_FILESYSTEM: The model artifacts are
///     made available to Triton via the local filesystem. 'location' returns
///     the full path to the directory. Ownership of the contents of the
///     returned directory is transferred to Triton and the agent should not
///     modify or free the contents until TRITONREPOAGENT_ModelFinalize.
///     The local filesystem directory can be created using
///     TRITONREPOAGENT_ModelRepositoryLocationAcquire or the agent can use
///     its own local filesystem API.
///
///   TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM: The model artifacts are
///     made available to Triton via a remote filesystem. 'location' returns
///     the full path to the remote filesystem directory. Ownership of the
///     contents of the returned directory is transferred to Triton and the
///     agent should not modify or free the contents until
///     TRITONREPOAGENT_ModelFinalize.
///
/// \param agent The agent.
/// \param model The model.
/// \param artifact_type The artifact type for the location.
/// \param path Returns the location.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryUpdate(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char* location);
/// Get the number of agent parameters defined for a model.
///
/// \param agent The agent.
/// \param model The model.
/// \param count Returns the number of agent parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameterCount(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    uint32_t* count);
/// Get a parameter name and value. The caller does not own the
/// returned strings and must not modify or delete them.
///
/// \param agent The agent.
/// \param model The model.
/// \param index The index of the parameter. Must be 0 <= index <
/// count, where count is the value returned by
/// TRITONREPOAGENT_ModelParameterCount.
/// \param parameter_name Returns the name of the parameter.
/// \param parameter_value Returns the value of the parameter.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelParameter(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t index, const char** parameter_name,
    const char** parameter_value);
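// Illustration (not part of the upstream header): iterating over the agent
// parameters defined for a model; error handling omitted.
//
//   uint32_t count;
//   TRITONREPOAGENT_ModelParameterCount(agent, model, &count);
//   for (uint32_t i = 0; i < count; ++i) {
//     const char* name;
//     const char* value;
//     TRITONREPOAGENT_ModelParameter(agent, model, i, &name, &value);
//     // 'name' and 'value' are owned by Triton; copy them if needed later.
//   }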
/// Get the model configuration. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object. If the model repository does not contain a
/// config.pbtxt file then 'model_config' is returned as nullptr.
///
/// \param agent The agent.
/// \param model The model.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model configuration as a message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelConfig(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t config_version, TRITONSERVER_Message** model_config);
/// Get the user-specified state associated with the model.
///
/// \param model The agent model.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelState(
    TRITONREPOAGENT_AgentModel* model, void** state);
/// Set the user-specified state associated with the model.
///
/// \param model The agent model.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelSetState(
    TRITONREPOAGENT_AgentModel* model, void* state);
/// Get the user-specified state associated with the agent.
///
/// \param agent The agent.
/// \param state Returns the user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_State(
    TRITONREPOAGENT_Agent* agent, void** state);
/// Set the user-specified state associated with the agent.
///
/// \param agent The agent.
/// \param state The user state, or nullptr if no user state.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_DECLSPEC TRITONSERVER_Error* TRITONREPOAGENT_SetState(
    TRITONREPOAGENT_Agent* agent, void* state);
///
/// The following functions can be implemented by an agent. Functions
/// indicated as required must be implemented or the agent will fail
/// to load.
///
/// Initialize an agent. This function is optional. This function is
/// called once when an agent is loaded to allow the agent to
/// initialize any state associated with the agent. An agent has a
/// single state that is shared across all invocations of the agent.
///
/// \param agent The agent.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_Initialize(
    TRITONREPOAGENT_Agent* agent);
/// Finalize for an agent. This function is optional. This function is
/// called once, just before the agent is unloaded. All state
/// associated with the agent should be freed and any threads created
/// for the agent should be exited/joined before returning from this
/// function.
///
/// \param agent The agent.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_Finalize(
    TRITONREPOAGENT_Agent* agent);
/// Initialize a model associated with an agent. This function is optional.
/// This function is called once when an agent model's lifecycle begins to allow
/// the agent model to initialize any state associated with it. An agent model
/// has a single state that is shared across all the lifecycle of the agent
/// model.
///
/// \param agent The agent to be associated with the model.
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelInitialize(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model);
/// Finalize for a model. This function is optional. This function is
/// called once, just before the end of the agent model's lifecycle. All state
/// associated with the agent model should be freed and any threads created
/// for the agent model should be exited/joined before returning from this
/// function. If the model acquired a model location using
/// TRITONREPOAGENT_ModelRepositoryLocationAcquire, it must call
/// TRITONREPOAGENT_ModelRepositoryLocationRelease to release that location.
///
/// \param agent The agent associated with the model.
/// \param model The model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelFinalize(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model);
/// Handle an action for a specified model. This function is
/// required. Triton will not perform multiple simultaneous calls to
/// this function for a given agent and model; however, there may be
/// simultaneous calls for the agent for different models.
///
/// If the agent does not handle the action the agent should
/// immediately return success (nullptr).
///
/// Any modification to the model's repository must be made when 'action_type'
/// is TRITONREPOAGENT_ACTION_LOAD.
/// To modify the model's repository the agent must either acquire a mutable
/// location via TRITONREPOAGENT_ModelRepositoryLocationAcquire
/// or its own managed location, report the location to Triton via
/// TRITONREPOAGENT_ModelRepositoryUpdate, and then return
/// success (nullptr). If the agent does not need to make any changes
/// to the model repository it should simply return success (nullptr)
/// without calling TRITONREPOAGENT_ModelRepositoryUpdate.
/// To indicate that a model load should fail return a non-success status.
///
/// \param agent The agent.
/// \param model The model that is the target of the action.
/// \param action_type The type of action the agent should handle for the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONREPOAGENT_ISPEC TRITONSERVER_Error* TRITONREPOAGENT_ModelAction(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ActionType action_type);
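// Illustration (not part of the upstream header): a TRITONREPOAGENT_ModelAction
// sketch that rewrites the repository contents on load and ignores every other
// action. 'RETURN_IF_ERROR' is a hypothetical error-propagation macro.
//
//   TRITONSERVER_Error* TRITONREPOAGENT_ModelAction(
//       TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
//       const TRITONREPOAGENT_ActionType action_type)
//   {
//     if (action_type != TRITONREPOAGENT_ACTION_LOAD) {
//       return nullptr;  // nothing to do for this action
//     }
//     const char* location;
//     RETURN_IF_ERROR(TRITONREPOAGENT_ModelRepositoryLocationAcquire(
//         agent, model, TRITONREPOAGENT_ARTIFACT_FILESYSTEM, &location));
//     // ... populate 'location' with the (possibly transformed) model files ...
//     return TRITONREPOAGENT_ModelRepositoryUpdate(
//         agent, model, TRITONREPOAGENT_ARTIFACT_FILESYSTEM, location);
//   }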
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/include/triton/core/tritonserver.h
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
/// \file
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern
"C"
{
#endif
#ifdef _COMPILING_TRITONSERVER
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONSERVER_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONSERVER_DECLSPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllimport)
#else
#define TRITONSERVER_DECLSPEC
#endif
#endif
struct TRITONSERVER_BufferAttributes;
struct TRITONSERVER_Error;
struct TRITONSERVER_InferenceRequest;
struct TRITONSERVER_InferenceResponse;
struct TRITONSERVER_InferenceTrace;
struct TRITONSERVER_Message;
struct TRITONSERVER_Metrics;
struct TRITONSERVER_Parameter;
struct TRITONSERVER_ResponseAllocator;
struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
///
/// TRITONSERVER API Version
///
/// The TRITONSERVER API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// client should check that the API version used to compile the
/// client is compatible with the API version of the Triton shared
/// library that it is linking against. This is typically done by code
/// similar to the following which makes sure that the major versions
/// are equal and that the minor version of the Triton shared library
/// is >= the minor version used to build the client.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONSERVER_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONSERVER_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton server API version does not support this client");
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 17
/// Get the TRITONSERVER API version supported by the Triton shared
/// library. This value can be compared against the
/// TRITONSERVER_API_VERSION_MAJOR and TRITONSERVER_API_VERSION_MINOR
/// used to build the client to ensure that Triton shared library is
/// compatible with the client.
///
/// \param major Returns the TRITONSERVER API major version supported
/// by Triton.
/// \param minor Returns the TRITONSERVER API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ApiVersion(
    uint32_t* major, uint32_t* minor);
/// TRITONSERVER_DataType
///
/// Tensor data types recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_datatype_enum {
  TRITONSERVER_TYPE_INVALID,
  TRITONSERVER_TYPE_BOOL,
  TRITONSERVER_TYPE_UINT8,
  TRITONSERVER_TYPE_UINT16,
  TRITONSERVER_TYPE_UINT32,
  TRITONSERVER_TYPE_UINT64,
  TRITONSERVER_TYPE_INT8,
  TRITONSERVER_TYPE_INT16,
  TRITONSERVER_TYPE_INT32,
  TRITONSERVER_TYPE_INT64,
  TRITONSERVER_TYPE_FP16,
  TRITONSERVER_TYPE_FP32,
  TRITONSERVER_TYPE_FP64,
  TRITONSERVER_TYPE_BYTES,
  TRITONSERVER_TYPE_BF16
} TRITONSERVER_DataType;
/// Get the string representation of a data type. The returned string
/// is not owned by the caller and so should not be modified or freed.
///
/// \param datatype The data type.
/// \return The string representation of the data type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_DataTypeString(
    TRITONSERVER_DataType datatype);
/// Get the Triton datatype corresponding to a string representation
/// of a datatype.
///
/// \param dtype The datatype string representation.
/// \return The Triton data type or TRITONSERVER_TYPE_INVALID if the
/// string does not represent a data type.
TRITONSERVER_DECLSPEC TRITONSERVER_DataType TRITONSERVER_StringToDataType(
    const char* dtype);
/// Get the size of a Triton datatype in bytes. Zero is returned for
/// TRITONSERVER_TYPE_BYTES because it has variable size. Zero is
/// returned for TRITONSERVER_TYPE_INVALID.
///
/// \param dtype The datatype.
/// \return The size of the datatype.
TRITONSERVER_DECLSPEC uint32_t TRITONSERVER_DataTypeByteSize(
    TRITONSERVER_DataType datatype);
/// TRITONSERVER_MemoryType
///
/// Types of memory recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_memorytype_enum {
  TRITONSERVER_MEMORY_CPU,
  TRITONSERVER_MEMORY_CPU_PINNED,
  TRITONSERVER_MEMORY_GPU
} TRITONSERVER_MemoryType;
/// Get the string representation of a memory type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param memtype The memory type.
/// \return The string representation of the memory type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_MemoryTypeString(
    TRITONSERVER_MemoryType memtype);
/// TRITONSERVER_ParameterType
///
/// Types of parameters recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_parametertype_enum {
  TRITONSERVER_PARAMETER_STRING,
  TRITONSERVER_PARAMETER_INT,
  TRITONSERVER_PARAMETER_BOOL,
  TRITONSERVER_PARAMETER_BYTES
} TRITONSERVER_ParameterType;
/// Get the string representation of a parameter type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param paramtype The parameter type.
/// \return The string representation of the parameter type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ParameterTypeString(
    TRITONSERVER_ParameterType paramtype);
/// Create a new parameter object. The caller takes ownership of the
/// TRITONSERVER_Parameter object and must call TRITONSERVER_ParameterDelete to
/// release the object. The object will maintain its own copy of the 'value'
///
/// \param name The parameter name.
/// \param type The parameter type.
/// \param value The pointer to the value.
/// \return A new TRITONSERVER_Parameter object. 'nullptr' will be returned if
/// 'type' is 'TRITONSERVER_PARAMETER_BYTES'. The caller should use
/// TRITONSERVER_ParameterBytesNew to create parameter with bytes type.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterNew(
    const char* name, const TRITONSERVER_ParameterType type,
    const void* value);
/// Create a new parameter object with type TRITONSERVER_PARAMETER_BYTES.
/// The caller takes ownership of the TRITONSERVER_Parameter object and must
/// call TRITONSERVER_ParameterDelete to release the object. The object only
/// maintains a shallow copy of the 'byte_ptr' so the data content must be
/// valid until the parameter object is deleted.
///
/// \param name The parameter name.
/// \param byte_ptr The pointer to the data content.
/// \param size The size of the data content.
/// \return A new TRITONSERVER_Parameter object.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterBytesNew(
    const char* name, const void* byte_ptr, const uint64_t size);
/// Delete a parameter object.
///
/// \param parameter The parameter object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ParameterDelete(
    TRITONSERVER_Parameter* parameter);
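// Illustration (not part of the upstream header): creating, using and
// releasing a string parameter; the name and value shown are arbitrary.
//
//   TRITONSERVER_Parameter* param = TRITONSERVER_ParameterNew(
//       "priority", TRITONSERVER_PARAMETER_STRING, "high");
//   if (param != nullptr) {
//     // ... attach the parameter to a request or response ...
//     TRITONSERVER_ParameterDelete(param);
//   }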
/// TRITONSERVER_InstanceGroupKind
///
/// Kinds of instance groups recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_instancegroupkind_enum {
  TRITONSERVER_INSTANCEGROUPKIND_AUTO,
  TRITONSERVER_INSTANCEGROUPKIND_CPU,
  TRITONSERVER_INSTANCEGROUPKIND_GPU,
  TRITONSERVER_INSTANCEGROUPKIND_MODEL
} TRITONSERVER_InstanceGroupKind;
/// Get the string representation of an instance-group kind. The
/// returned string is not owned by the caller and so should not be
/// modified or freed.
///
/// \param kind The instance-group kind.
/// \return The string representation of the kind.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InstanceGroupKindString(
    TRITONSERVER_InstanceGroupKind kind);
/// TRITONSERVER_Logging
///
/// Types/levels of logging.
///
typedef enum TRITONSERVER_loglevel_enum {
  TRITONSERVER_LOG_INFO,
  TRITONSERVER_LOG_WARN,
  TRITONSERVER_LOG_ERROR,
  TRITONSERVER_LOG_VERBOSE
} TRITONSERVER_LogLevel;
///
/// Format of logging.
///
/// TRITONSERVER_LOG_DEFAULT: the log severity (L) and timestamp will be
/// logged as "LMMDD hh:mm:ss.ssssss".
///
/// TRITONSERVER_LOG_ISO8601: the log format will be "YYYY-MM-DDThh:mm:ssZ L".
///
typedef enum TRITONSERVER_logformat_enum {
  TRITONSERVER_LOG_DEFAULT,
  TRITONSERVER_LOG_ISO8601
} TRITONSERVER_LogFormat;
/// Is a log level enabled?
///
/// \param level The log level.
/// \return True if the log level is enabled, false if not enabled.
TRITONSERVER_DECLSPEC bool TRITONSERVER_LogIsEnabled(
    TRITONSERVER_LogLevel level);
/// Log a message at a given log level if that level is enabled.
///
/// \param level The log level.
/// \param filename The file name of the location of the log message.
/// \param line The line number of the log message.
/// \param msg The log message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_LogMessage(
    TRITONSERVER_LogLevel level, const char* filename, const int line,
    const char* msg);
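// Illustration (not part of the upstream header): guarding a log message with
// TRITONSERVER_LogIsEnabled so the message is only formatted when needed.
//
//   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
//     TRITONSERVER_Error* err = TRITONSERVER_LogMessage(
//         TRITONSERVER_LOG_VERBOSE, __FILE__, __LINE__, "starting inference");
//     if (err != nullptr) {
//       TRITONSERVER_ErrorDelete(err);  // logging failure is non-fatal here
//     }
//   }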
/// TRITONSERVER_Error
///
/// Errors are reported by a TRITONSERVER_Error object. A NULL
/// TRITONSERVER_Error indicates no error, a non-NULL TRITONSERVER_Error
/// indicates error and the code and message for the error can be
/// retrieved from the object.
///
/// The caller takes ownership of a TRITONSERVER_Error object returned by
/// the API and must call TRITONSERVER_ErrorDelete to release the object.
///
/// The TRITONSERVER_Error error codes
typedef enum TRITONSERVER_errorcode_enum {
  TRITONSERVER_ERROR_UNKNOWN,
  TRITONSERVER_ERROR_INTERNAL,
  TRITONSERVER_ERROR_NOT_FOUND,
  TRITONSERVER_ERROR_INVALID_ARG,
  TRITONSERVER_ERROR_UNAVAILABLE,
  TRITONSERVER_ERROR_UNSUPPORTED,
  TRITONSERVER_ERROR_ALREADY_EXISTS
} TRITONSERVER_Error_Code;
/// Create a new error object. The caller takes ownership of the
/// TRITONSERVER_Error object and must call TRITONSERVER_ErrorDelete to
/// release the object.
///
/// \param code The error code.
/// \param msg The error message.
/// \return A new TRITONSERVER_Error object.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ErrorNew(
    TRITONSERVER_Error_Code code, const char* msg);
/// Delete an error object.
///
/// \param error The error object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error);
/// Get the error code.
///
/// \param error The error object.
/// \return The error code.
TRITONSERVER_DECLSPEC TRITONSERVER_Error_Code TRITONSERVER_ErrorCode(
    TRITONSERVER_Error* error);
/// Get the string representation of an error code. The returned
/// string is not owned by the caller and so should not be modified or
/// freed. The lifetime of the returned string extends only as long as
/// 'error' and must not be accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The string representation of the error code.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorCodeString(
    TRITONSERVER_Error* error);
/// Get the error message. The returned string is not owned by the
/// caller and so should not be modified or freed. The lifetime of the
/// returned string extends only as long as 'error' and must not be
/// accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The error message.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorMessage(
    TRITONSERVER_Error* error);
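// Illustration (not part of the upstream header): the typical pattern for
// consuming an error returned by any of these APIs; the caller owns the error
// object and must delete it.
//
//   uint32_t major, minor;
//   TRITONSERVER_Error* err = TRITONSERVER_ApiVersion(&major, &minor);
//   if (err != nullptr) {
//     fprintf(
//         stderr, "error: %s - %s\n", TRITONSERVER_ErrorCodeString(err),
//         TRITONSERVER_ErrorMessage(err));
//     TRITONSERVER_ErrorDelete(err);
//   }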
/// TRITONSERVER_ResponseAllocator
///
/// Object representing a memory allocator for output tensors in an
/// inference response.
///
/// Type for allocation function that allocates a buffer to hold an
/// output tensor.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param byte_size The size of the buffer to allocate.
/// \param memory_type The type of memory that the caller prefers for
/// the buffer allocation.
/// \param memory_type_id The ID of the memory that the caller prefers
/// for the buffer allocation.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Returns a pointer to the allocated memory.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \param actual_memory_type Returns the type of memory where the
/// allocation resides. May be different than the type of memory
/// requested by 'memory_type'.
/// \param actual_memory_type_id Returns the ID of the memory where
/// the allocation resides. May be different than the ID of the memory
/// requested by 'memory_type_id'.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorAllocFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp,
    TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id);
/// Type for allocation function that allocates a buffer to hold an
/// output tensor with buffer attributes. The callback function must fill in the
/// appropriate buffer attributes information related to this buffer. If set,
/// this function is always called after TRITONSERVER_ResponseAllocatorAllocFn_t
/// function.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param buffer_attributes The buffer attributes associated with the buffer.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (
    *TRITONSERVER_ResponseAllocatorBufferAttributesFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    TRITONSERVER_BufferAttributes* buffer_attributes, void* userp,
    void* buffer_userp);
/// Type for function that is called to query the allocator's preferred memory
/// type and memory type ID. As much as possible, the allocator should attempt
/// to return the same memory_type and memory_type_id values that will be
/// returned by the subsequent call to TRITONSERVER_ResponseAllocatorAllocFn_t.
/// But the allocator is not required to do so.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor. This is optional
/// and should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type
/// preferred by the allocator, taking into account the caller's
/// preferred type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type
/// ID preferred by the allocator, taking into account the caller's
/// preferred type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorQueryFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* userp,
    const char* tensor_name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Type for function that is called when the server no longer holds
/// any reference to a buffer allocated by
/// TRITONSERVER_ResponseAllocatorAllocFn_t. In practice this function
/// is typically called when the response object associated with the
/// buffer is deleted by TRITONSERVER_InferenceResponseDelete.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Pointer to the buffer to be freed.
/// \param buffer_userp The user-specified value associated
/// with the buffer in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \param byte_size The size of the buffer.
/// \param memory_type The type of memory holding the buffer.
/// \param memory_type_id The ID of the memory holding the buffer.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting the release. If an error is returned Triton will not
/// attempt to release the buffer again.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorReleaseFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer,
    void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id);
/// Type for function that is called to indicate that subsequent
/// allocation requests will refer to a new response.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorStartFn_t)(
    TRITONSERVER_ResponseAllocator* allocator, void* userp);
/// Create a new response allocator object.
///
/// The response allocator object is used by Triton to allocate
/// buffers to hold the output tensors in inference responses. Most
/// models generate a single response for each inference request
/// (TRITONSERVER_TXN_ONE_TO_ONE). For these models the order of
/// callbacks will be:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn : optional (and typically not required)
/// - alloc_fn : called once for each output tensor in response
/// TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in response
///
/// For models that generate multiple responses for each inference
/// request (TRITONSERVER_TXN_DECOUPLED), the start_fn callback can be
/// used to determine sets of alloc_fn callbacks that belong to the
/// same response:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// ...
/// For each response, TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in the response
///
/// In all cases the start_fn, alloc_fn and release_fn callback
/// functions must be thread-safe. Typically making these functions
/// thread-safe does not require explicit locking. The recommended way
/// to implement these functions is to have each inference request
/// provide a 'response_allocator_userp' object that is unique to that
/// request with TRITONSERVER_InferenceRequestSetResponseCallback. The
/// callback functions then operate only on this unique state. Locking
/// is required only when the callback function needs to access state
/// that is shared across inference requests (for example, a common
/// allocation pool).
///
/// \param allocator Returns the new response allocator object.
/// \param alloc_fn The function to call to allocate buffers for result
/// tensors.
/// \param release_fn The function to call when the server no longer
/// holds a reference to an allocated buffer.
/// \param start_fn The function to call to indicate that the
/// subsequent 'alloc_fn' calls are for a new response. This callback
/// is optional (use nullptr to indicate that it should not be
/// invoked).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorNew(
    TRITONSERVER_ResponseAllocator** allocator,
    TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
    TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
    TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
/// Set the buffer attributes function for a response allocator object.
/// The function will be called after alloc_fn to set the buffer attributes
/// associated with the output buffer.
///
/// The thread-safety requirement for buffer_attributes_fn is the same as
/// for the other allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param buffer_attributes_fn The function to call to get the buffer
/// attributes information for an allocated buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
    TRITONSERVER_ResponseAllocator* allocator,
    TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn);
/// Set the query function for a response allocator object. Usually the
/// function will be called before alloc_fn to learn the allocator's preferred
/// memory type and memory type ID for the current situation so that a
/// different execution decision can be made.
///
/// The thread-safety requirement for query_fn is the same as for the other
/// allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param query_fn The function to call to query allocator's preferred memory
/// type and memory type ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetQueryFunction(
    TRITONSERVER_ResponseAllocator* allocator,
    TRITONSERVER_ResponseAllocatorQueryFn_t query_fn);
/// Delete a response allocator.
///
/// \param allocator The response allocator object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorDelete(
    TRITONSERVER_ResponseAllocator* allocator);
/// TRITONSERVER_Message
///
/// Object representing a Triton Server message.
///
/// Create a new message object from a serialized JSON string.
///
/// \param message The message object.
/// \param base The base of the serialized JSON.
/// \param byte_size The size, in bytes, of the serialized message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MessageNewFromSerializedJson(
    TRITONSERVER_Message** message, const char* base, size_t byte_size);
/// Delete a message object.
///
/// \param message The message object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageDelete(
    TRITONSERVER_Message* message);
/// Get the base and size of the buffer containing the serialized
/// message in JSON format. The buffer is owned by the
/// TRITONSERVER_Message object and should not be modified or freed by
/// the caller. The lifetime of the buffer extends only as long as
/// 'message' and must not be accessed once 'message' is deleted.
///
/// \param message The message object.
/// \param base Returns the base of the serialized message.
/// \param byte_size Returns the size, in bytes, of the serialized
/// message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageSerializeToJson(
    TRITONSERVER_Message* message, const char** base, size_t* byte_size);
/// TRITONSERVER_Metrics
///
/// Object representing metrics.
///
/// Metric format types
typedef enum tritonserver_metricformat_enum {
  TRITONSERVER_METRIC_PROMETHEUS
} TRITONSERVER_MetricFormat;
/// Delete a metrics object.
///
/// \param metrics The metrics object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsDelete(
    TRITONSERVER_Metrics* metrics);
/// Get a buffer containing the metrics in the specified format. For
/// each format the buffer contains the following:
///
/// TRITONSERVER_METRIC_PROMETHEUS: 'base' points to a single multiline
/// string (char*) that gives a text representation of the metrics in
/// prometheus format. 'byte_size' returns the length of the string
/// in bytes.
///
/// The buffer is owned by the 'metrics' object and should not be
/// modified or freed by the caller. The lifetime of the buffer
/// extends only as long as 'metrics' and must not be accessed once
/// 'metrics' is deleted.
///
/// \param metrics The metrics object.
/// \param format The format to use for the returned metrics.
/// \param base Returns a pointer to the base of the formatted
/// metrics, as described above.
/// \param byte_size Returns the size, in bytes, of the formatted
/// metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsFormatted(
    TRITONSERVER_Metrics* metrics, TRITONSERVER_MetricFormat format,
    const char** base, size_t* byte_size);
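/// For example, assuming a TRITONSERVER_Metrics object 'metrics' obtained
/// from the server elsewhere (illustrative sketch, error checking omitted):
///
///   const char* base = nullptr;
///   size_t byte_size = 0;
///   TRITONSERVER_MetricsFormatted(
///       metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
///   // 'base' now points to the Prometheus text, valid until 'metrics'
///   // is deleted.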
/// TRITONSERVER_InferenceTrace
///
/// Object that represents tracing for an inference request.
///
/// Trace levels. The trace level controls the type of trace
/// activities that are reported for an inference request.
///
/// Trace level values are power-of-2 and can be combined to trace
/// multiple types of activities. For example, use
/// (TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
/// TRITONSERVER_TRACE_LEVEL_TENSORS) to trace both timestamps and
/// tensors for an inference request.
///
/// TRITONSERVER_TRACE_LEVEL_MIN and TRITONSERVER_TRACE_LEVEL_MAX are
/// deprecated and should not be used.
typedef enum tritonserver_tracelevel_enum {
  /// Tracing disabled. No trace activities are reported.
  TRITONSERVER_TRACE_LEVEL_DISABLED = 0,
  /// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
  TRITONSERVER_TRACE_LEVEL_MIN = 1,
  /// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
  TRITONSERVER_TRACE_LEVEL_MAX = 2,
  /// Record timestamps for the inference request.
  TRITONSERVER_TRACE_LEVEL_TIMESTAMPS = 0x4,
  /// Record input and output tensor values for the inference request.
  TRITONSERVER_TRACE_LEVEL_TENSORS = 0x8
} TRITONSERVER_InferenceTraceLevel;
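/// For example, to request both timestamp and tensor tracing the levels can
/// be combined (illustrative sketch):
///
///   auto level = static_cast<TRITONSERVER_InferenceTraceLevel>(
///       TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
///       TRITONSERVER_TRACE_LEVEL_TENSORS);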
/// Get the string representation of a trace level. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param level The trace level.
/// \return The string representation of the trace level.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceLevelString(
    TRITONSERVER_InferenceTraceLevel level);
/// Trace activities
typedef enum tritonserver_traceactivity_enum {
  TRITONSERVER_TRACE_REQUEST_START = 0,
  TRITONSERVER_TRACE_QUEUE_START = 1,
  TRITONSERVER_TRACE_COMPUTE_START = 2,
  TRITONSERVER_TRACE_COMPUTE_INPUT_END = 3,
  TRITONSERVER_TRACE_COMPUTE_OUTPUT_START = 4,
  TRITONSERVER_TRACE_COMPUTE_END = 5,
  TRITONSERVER_TRACE_REQUEST_END = 6,
  TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7,
  TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8,
  TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9
} TRITONSERVER_InferenceTraceActivity;
/// Get the string representation of a trace activity. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param activity The trace activity.
/// \return The string representation of the trace activity.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceActivityString(
    TRITONSERVER_InferenceTraceActivity activity);
/// Type for trace timeline activity callback function. This callback function
/// is used to report activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceActivityFn_t)(
    TRITONSERVER_InferenceTrace* trace,
    TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
    void* userp);
/// Type for trace tensor activity callback function. This callback function
/// is used to report tensor activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceTensorNew.
typedef void (*TRITONSERVER_InferenceTraceTensorActivityFn_t)(
    TRITONSERVER_InferenceTrace* trace,
    TRITONSERVER_InferenceTraceActivity activity, const char* name,
    TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
    const int64_t* shape, uint64_t dim_count,
    TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp);
/// Type for trace release callback function. This callback function
/// is called when all activity for the trace has completed. The
/// callback function takes ownership of the
/// TRITONSERVER_InferenceTrace object. The 'userp' data is the same
/// as what is supplied in the call to TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceReleaseFn_t)(
    TRITONSERVER_InferenceTrace* trace, void* userp);
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The activity callback function will be called to report activity
/// for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where activity for the
/// trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceNew(
    TRITONSERVER_InferenceTrace** trace,
    TRITONSERVER_InferenceTraceLevel level, uint64_t parent_id,
    TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
    TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
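/// A minimal usage sketch (illustrative only, error checking omitted).
/// 'MyTraceActivity' and 'MyTraceRelease' are hypothetical user-provided
/// functions matching TRITONSERVER_InferenceTraceActivityFn_t and
/// TRITONSERVER_InferenceTraceReleaseFn_t:
///
///   TRITONSERVER_InferenceTrace* trace = nullptr;
///   TRITONSERVER_InferenceTraceNew(
///       &trace, TRITONSERVER_TRACE_LEVEL_TIMESTAMPS, 0 /* parent_id */,
///       MyTraceActivity, MyTraceRelease, nullptr /* trace_userp */);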
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The timeline and tensor activity callback function will be called to report
/// activity for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where timeline activity for the
/// trace is reported.
/// \param tensor_activity_fn The callback function where tensor activity for
/// the trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceTensorNew(
    TRITONSERVER_InferenceTrace** trace,
    TRITONSERVER_InferenceTraceLevel level, uint64_t parent_id,
    TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
    TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
    TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
/// Delete a trace object.
///
/// \param trace The trace object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceDelete(
    TRITONSERVER_InferenceTrace* trace);
/// Get the id associated with a trace. Every trace is assigned an id
/// that is unique across all traces created for a Triton server.
///
/// \param trace The trace.
/// \param id Returns the id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceId(
    TRITONSERVER_InferenceTrace* trace, uint64_t* id);
/// Get the parent id associated with a trace. The parent id indicates
/// a parent-child relationship between two traces. A parent id value
/// of 0 indicates that there is no parent trace.
///
/// \param trace The trace.
/// \param id Returns the parent id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceParentId(
    TRITONSERVER_InferenceTrace* trace, uint64_t* parent_id);
/// Get the name of the model associated with a trace. The caller does
/// not own the returned string and must not modify or delete it. The
/// lifetime of the returned string extends only as long as 'trace'.
///
/// \param trace The trace.
/// \param model_name Returns the name of the model associated with
/// the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceModelName(
    TRITONSERVER_InferenceTrace* trace, const char** model_name);
/// Get the version of the model associated with a trace.
///
/// \param trace The trace.
/// \param model_version Returns the version of the model associated
/// with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceTraceModelVersion(
    TRITONSERVER_InferenceTrace* trace, int64_t* model_version);
/// TRITONSERVER_InferenceRequest
///
/// Object representing an inference request. The inference request
/// provides the meta-data and input tensor values needed for an
/// inference and returns the inference result meta-data and output
/// tensors. An inference request object can be modified and reused
/// multiple times.
///
/// Inference request flags. The enum values must be power-of-2 values.
typedef enum tritonserver_requestflag_enum {
  TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1,
  TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2
} TRITONSERVER_RequestFlag;
/// Inference request release flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_requestreleaseflag_enum {
  TRITONSERVER_REQUEST_RELEASE_ALL = 1
} TRITONSERVER_RequestReleaseFlag;
/// Inference response complete flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_responsecompleteflag_enum {
  TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1
} TRITONSERVER_ResponseCompleteFlag;
/// Type for inference request release callback function. The callback
/// indicates what type of release is being performed on the request
/// and for some of these the callback function takes ownership of the
/// TRITONSERVER_InferenceRequest object. The 'userp' data is the data
/// provided as 'request_release_userp' in the call to
/// TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// One or more flags will be specified when the callback is invoked,
/// and the callback must take the following actions:
///
/// - TRITONSERVER_REQUEST_RELEASE_ALL: The entire inference request
/// is being released and ownership is passed to the callback
/// function. Triton will no longer access the 'request' object
/// itself nor any input tensor data associated with the
/// request. The callback should free or otherwise manage the
/// 'request' object and all associated tensor data.
///
/// Note that currently TRITONSERVER_REQUEST_RELEASE_ALL should always
/// be set when the callback is invoked but in the future that may
/// change, so the callback should explicitly check for the flag
/// before taking ownership of the request object.
///
typedef void (*TRITONSERVER_InferenceRequestReleaseFn_t)(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
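/// A sketch of a release callback that follows the recommendation above
/// (illustrative only):
///
///   void MyRequestRelease(
///       TRITONSERVER_InferenceRequest* request, const uint32_t flags,
///       void* userp)
///   {
///     if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
///       // Ownership has been returned; the request may now be reused or
///       // deleted with TRITONSERVER_InferenceRequestDelete.
///       TRITONSERVER_InferenceRequestDelete(request);
///     }
///   }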
/// Type for callback function indicating that an inference response
/// has completed. The callback function takes ownership of the
/// TRITONSERVER_InferenceResponse object. The 'userp' data is the
/// data provided as 'response_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// One or more flags may be specified when the callback is invoked:
///
/// - TRITONSERVER_RESPONSE_COMPLETE_FINAL: Indicates that no more
/// responses will be generated for a given request (more
/// specifically, that no more responses will be generated for the
/// inference request that set this callback and 'userp'). When
/// this flag is set 'response' may be a response object or may be
/// nullptr. If 'response' is not nullptr, then 'response' is the
/// last response that Triton will produce for the request. If
/// 'response' is nullptr then Triton is indicating that no more
/// responses will be produced for the request.
typedef void (*TRITONSERVER_InferenceResponseCompleteFn_t)(
    TRITONSERVER_InferenceResponse* response, const uint32_t flags,
    void* userp);
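/// A sketch of a response callback (illustrative only); it consumes the
/// response and notes when the final flag is seen:
///
///   void MyResponseComplete(
///       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
///       void* userp)
///   {
///     if (response != nullptr) {
///       // ... read outputs, then release the response ...
///       TRITONSERVER_InferenceResponseDelete(response);
///     }
///     if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
///       // No more responses will be produced for this request.
///     }
///   }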
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server the inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestNew(
    TRITONSERVER_InferenceRequest** inference_request,
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version);
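/// For example (illustrative sketch, error checking omitted; 'server' is an
/// existing TRITONSERVER_Server and "mymodel" is a hypothetical model name):
///
///   TRITONSERVER_InferenceRequest* request = nullptr;
///   TRITONSERVER_InferenceRequestNew(
///       &request, server, "mymodel", -1 /* version chosen by policy */);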
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestDelete(
    TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestId(
    TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetId(
    TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestFlags(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t* flags);
/// Set the flag(s) associated with a request. 'flags' should hold a
/// bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags The flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetFlags(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
/// Get the correlation ID of the inference request as an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is a string,
/// this function will return a failure. The correlation ID is used
/// to indicate that two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationId(
    TRITONSERVER_InferenceRequest* inference_request,
    uint64_t* correlation_id);
/// Get the correlation ID of the inference request as a string.
/// Default is empty "", which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is an unsigned
/// integer, then this function will return a failure. The correlation ID
/// is used to indicate that two or more inference requests are related to each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationIdString(
    TRITONSERVER_InferenceRequest* inference_request,
    const char** correlation_id);
/// Set the correlation ID of the inference request to be an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// The correlation ID is used to indicate that two or more inference requests
/// are related to each other. How this relationship is handled by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationId(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id);
/// Set the correlation ID of the inference request to be a string.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is
/// handled by the inference server is determined by the model's
/// scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationIdString(
    TRITONSERVER_InferenceRequest* inference_request,
    const char* correlation_id);
/// Get the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority Returns the priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestPriority(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t* priority);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriority(
    TRITONSERVER_InferenceRequest* inference_request, uint32_t priority);
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t* timeout_us);
/// Set the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us The timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
    TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
/// Add an input to a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param datatype The type of the input. Valid type names are BOOL,
/// UINT8, UINT16, UINT32, UINT64, INT8, INT16, INT32, INT64, FP16,
/// FP32, FP64, and BYTES.
/// \param shape The shape of the input.
/// \param dim_count The number of dimensions of 'shape'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const TRITONSERVER_DataType datatype, const int64_t* shape,
    uint64_t dim_count);
/// Add a raw input to a request. The name recognized by the model, the data
/// type, and the shape of the input will be deduced from the model
/// configuration. This function must be called at most once on a request that
/// has no other inputs, so that the deduction is unambiguous.
///
/// \param inference_request The request object.
/// \param name The name of the input. This name is only used to reference the
/// raw input in other Tritonserver APIs. It is not associated with the name
/// used in the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRawInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an input from a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveInput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all inputs from a request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputs(
    TRITONSERVER_InferenceRequest* inference_request);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputData(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id);
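/// For example, to provide a CPU-resident FP32 input with shape [1, 3]
/// (illustrative sketch, error checking omitted; the tensor name "INPUT0"
/// and shape are hypothetical):
///
///   const int64_t shape[] = {1, 3};
///   float data[3] = {0.f, 1.f, 2.f};
///   TRITONSERVER_InferenceRequestAddInput(
///       request, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
///   TRITONSERVER_InferenceRequestAppendInputData(
///       request, "INPUT0", data, sizeof(data), TRITONSERVER_MEMORY_CPU,
///       0 /* memory_type_id */);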
/// Assign a buffer of data to an input for execution on all model instances
/// with the specified host policy. The buffer will be appended to any existing
/// buffers for that input on all devices with this host policy. The
/// 'inference_request' object takes ownership of the buffer and so the caller
/// should not modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed from
/// 'inference_request'. If the execution is scheduled on a device that does
/// not have an input buffer specified using this function, then the input
/// buffer specified with TRITONSERVER_InferenceRequestAppendInputData will be
/// used, so a version of the data that is not specific to a host policy must
/// also be added using that API.
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \param host_policy_name All model instances executing with this host_policy
/// will use this input buffer for execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, const char* host_policy_name);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param buffer_attributes The buffer attributes of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
    TRITONSERVER_InferenceRequest* inference_request, const char* name,
    const void* base, TRITONSERVER_BufferAttributes* buffer_attributes);
/// Clear all input data from an input, releasing ownership of the
/// buffer(s) that were appended to the input with
/// TRITONSERVER_InferenceRequestAppendInputData or
/// TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputData(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Add an output request to an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRequestedOutput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an output request from an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveRequestedOutput(
    TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all output requests from an inference request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllRequestedOutputs(
    TRITONSERVER_InferenceRequest* inference_request);
/// Set the release callback for an inference request. The release
/// callback is called by Triton to return ownership of the request
/// object.
///
/// \param inference_request The request object.
/// \param request_release_fn The function called to return ownership
/// of the 'inference_request' object.
/// \param request_release_userp User-provided pointer that is
/// delivered to the 'request_release_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetReleaseCallback(
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
    void* request_release_userp);
/// Set the allocator and response callback for an inference
/// request. The allocator is used to allocate buffers for any output
/// tensors included in responses that are produced for this
/// request. The response callback is called to return response
/// objects representing responses produced for this request.
///
/// \param inference_request The request object.
/// \param response_allocator The TRITONSERVER_ResponseAllocator to use
/// to allocate buffers to hold inference results.
/// \param response_allocator_userp User-provided pointer that is
/// delivered to the response allocator's start and allocation functions.
/// \param response_fn The function called to deliver an inference
/// response for this request.
/// \param response_userp User-provided pointer that is delivered to
/// the 'response_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetResponseCallback(
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_ResponseAllocator* response_allocator,
    void* response_allocator_userp,
    TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
    void* response_userp);
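/// For example (illustrative sketch, error checking omitted; 'allocator' was
/// created with TRITONSERVER_ResponseAllocatorNew and 'MyResponseComplete' is
/// a hypothetical TRITONSERVER_InferenceResponseCompleteFn_t):
///
///   TRITONSERVER_InferenceRequestSetResponseCallback(
///       request, allocator, nullptr /* response_allocator_userp */,
///       MyResponseComplete, nullptr /* response_userp */);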
/// TRITONSERVER_InferenceResponse
///
/// Object representing an inference response. The inference response
/// provides the meta-data and output tensor values calculated by the
/// inference.
///
/// Delete an inference response object.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseDelete(
    TRITONSERVER_InferenceResponse* inference_response);
/// Return the error status of an inference response. Return a
/// TRITONSERVER_Error object on failure, return nullptr on success.
/// The returned error object is owned by 'inference_response' and so
/// should not be deleted by the caller.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating the success or failure
/// status of the response.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseError(
    TRITONSERVER_InferenceResponse* inference_response);
/// Get model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model used to produce
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseModel(
    TRITONSERVER_InferenceResponse* inference_response,
    const char** model_name, int64_t* model_version);
/// Get the ID of the request corresponding to a response. The caller
/// does not own the returned ID and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param request_id Returns the ID of the request corresponding to
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseId(
    TRITONSERVER_InferenceResponse* inference_response,
    const char** request_id);
/// Get the number of parameters available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameterCount(
    TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about a parameter. The caller does not own any
/// of the returned values and must not modify or delete them. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// The 'vvalue' returns a void* pointer that must be cast
/// appropriately based on 'type'. For example:
///
/// void* vvalue;
/// TRITONSERVER_ParameterType type;
/// TRITONSERVER_InferenceResponseParameter(
/// response, index, &name, &type, &vvalue);
/// switch (type) {
/// case TRITONSERVER_PARAMETER_BOOL:
/// bool value = *(reinterpret_cast<bool*>(vvalue));
/// ...
/// case TRITONSERVER_PARAMETER_INT:
/// int64_t value = *(reinterpret_cast<int64_t*>(vvalue));
/// ...
/// case TRITONSERVER_PARAMETER_STRING:
/// const char* value = reinterpret_cast<const char*>(vvalue);
/// ...
///
/// \param inference_response The response object.
/// \param index The index of the parameter, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseParameterCount.
/// \param name Returns the name of the parameter.
/// \param type Returns the type of the parameter.
/// \param vvalue Returns a pointer to the parameter value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameter(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const char** name, TRITONSERVER_ParameterType* type, const void** vvalue);
/// Get the number of outputs available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputCount(
    TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about an output tensor. The tensor data is
/// returned as the base pointer to the data and the size, in bytes,
/// of the data. The caller does not own any of the returned values
/// and must not modify or delete them. The lifetime of all returned
/// values extends until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param name Returns the name of the output.
/// \param datatype Returns the type of the output.
/// \param shape Returns the shape of the output.
/// \param dim_count Returns the number of dimensions of the returned
/// shape.
/// \param base Returns the tensor data for the output.
/// \param byte_size Returns the size, in bytes, of the data.
/// \param memory_type Returns the memory type of the data.
/// \param memory_type_id Returns the memory type id of the data.
/// \param userp The user-specified value associated with the buffer
/// in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseOutput(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint64_t* dim_count, const void** base, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
    void** userp);
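/// For example, to walk all outputs of a response (illustrative sketch,
/// error checking omitted):
///
///   uint32_t count = 0;
///   TRITONSERVER_InferenceResponseOutputCount(response, &count);
///   for (uint32_t i = 0; i < count; ++i) {
///     const char* name; TRITONSERVER_DataType dt; const int64_t* shape;
///     uint64_t dims; const void* base; size_t byte_size;
///     TRITONSERVER_MemoryType mtype; int64_t mtype_id; void* userp;
///     TRITONSERVER_InferenceResponseOutput(
///         response, i, &name, &dt, &shape, &dims, &base, &byte_size,
///         &mtype, &mtype_id, &userp);
///     // ... consume the tensor data referenced by 'base' ...
///   }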
/// Get a classification label associated with an output for a given
/// index. The caller does not own the returned label and must not
/// modify or delete it. The lifetime of the returned label extends
/// until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param class_index The index of the class.
/// \param label Returns the label corresponding to 'class_index' or
/// nullptr if no label.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputClassificationLabel(
    TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
    const size_t class_index, const char** label);
/// TRITONSERVER_BufferAttributes
///
/// API to create, modify, or retrieve attributes associated with a buffer.
///
/// Create a new buffer attributes object. The caller takes ownership of
/// the TRITONSERVER_BufferAttributes object and must call
/// TRITONSERVER_BufferAttributesDelete to release the object.
///
/// \param buffer_attributes Returns the new buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesNew(
    TRITONSERVER_BufferAttributes** buffer_attributes);
/// Delete a buffer attributes object.
///
/// \param buffer_attributes The buffer_attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesDelete(
    TRITONSERVER_BufferAttributes* buffer_attributes);
/// Set the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Memory type id to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryTypeId(
    TRITONSERVER_BufferAttributes* buffer_attributes, int64_t memory_type_id);
/// Set the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Memory type to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType memory_type);
/// Set the CudaIpcHandle field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle The CudaIpcHandle to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void* cuda_ipc_handle);
/// Set the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Byte size to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t byte_size);
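/// For example, to describe a CPU buffer of 16 bytes (illustrative sketch,
/// error checking omitted):
///
///   TRITONSERVER_BufferAttributes* attrs = nullptr;
///   TRITONSERVER_BufferAttributesNew(&attrs);
///   TRITONSERVER_BufferAttributesSetMemoryType(attrs, TRITONSERVER_MEMORY_CPU);
///   TRITONSERVER_BufferAttributesSetMemoryTypeId(attrs, 0);
///   TRITONSERVER_BufferAttributesSetByteSize(attrs, 16);
///   // ... use 'attrs', then release it ...
///   TRITONSERVER_BufferAttributesDelete(attrs);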
/// Get the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Returns the memory type id associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryTypeId(
    TRITONSERVER_BufferAttributes* buffer_attributes, int64_t* memory_type_id);
/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
    TRITONSERVER_BufferAttributes* buffer_attributes,
    TRITONSERVER_MemoryType* memory_type);
/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If the CUDA IPC handle does not exist for the
/// buffer, nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
    TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);
/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesByteSize(
    TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
/// TRITONSERVER_ServerOptions
///
/// Options to use when creating an inference server.
///
/// Model control modes
typedef enum tritonserver_modelcontrolmode_enum {
  TRITONSERVER_MODEL_CONTROL_NONE,
  TRITONSERVER_MODEL_CONTROL_POLL,
  TRITONSERVER_MODEL_CONTROL_EXPLICIT
} TRITONSERVER_ModelControlMode;
/// Rate limit modes
typedef enum tritonserver_ratelimitmode_enum {
  TRITONSERVER_RATE_LIMIT_OFF,
  TRITONSERVER_RATE_LIMIT_EXEC_COUNT
} TRITONSERVER_RateLimitMode;
/// Create a new server options object. The caller takes ownership of
/// the TRITONSERVER_ServerOptions object and must call
/// TRITONSERVER_ServerOptionsDelete to release the object.
///
/// \param options Returns the new server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsNew(
    TRITONSERVER_ServerOptions** options);
/// Delete a server options object.
///
/// \param options The server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsDelete(
    TRITONSERVER_ServerOptions* options);
/// Set the textual ID for the server in a server options. The ID is a
/// name that identifies the server.
///
/// \param options The server options object.
/// \param server_id The server identifier.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetServerId(
    TRITONSERVER_ServerOptions* options, const char* server_id);
/// Set the model repository path in a server options. The path must be
/// the full absolute path to the model repository. This function can be called
/// multiple times with different paths to set multiple model repositories.
/// Note that if a model is not unique across all model repositories
/// at any time, the model will not be available.
///
/// \param options The server options object.
/// \param model_repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelRepositoryPath(
    TRITONSERVER_ServerOptions* options, const char* model_repository_path);
/// Set the model control mode in a server options. For each mode the models
/// will be managed as the following:
///
/// TRITONSERVER_MODEL_CONTROL_NONE: the models in model repository will be
/// loaded on startup. After startup any changes to the model repository will
/// be ignored. Calling TRITONSERVER_ServerPollModelRepository will result in
/// an error.
///
/// TRITONSERVER_MODEL_CONTROL_POLL: the models in model repository will be
/// loaded on startup. The model repository can be polled periodically using
/// TRITONSERVER_ServerPollModelRepository and the server will load, unload,
/// and update models according to changes in the model repository.
///
/// TRITONSERVER_MODEL_CONTROL_EXPLICIT: the models in model repository will
/// not be loaded on startup. The corresponding model control APIs must be
/// called to load / unload a model in the model repository.
///
/// \param options The server options object.
/// \param mode The mode to use for the model control.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelControlMode(
    TRITONSERVER_ServerOptions* options, TRITONSERVER_ModelControlMode mode);
/// Set the model to be loaded at startup in a server options. The model must be
/// present in one, and only one, of the specified model repositories.
/// This function can be called multiple times with different model names
/// to set multiple startup models.
/// Note that it only takes effect in TRITONSERVER_MODEL_CONTROL_EXPLICIT mode.
///
/// \param options The server options object.
/// \param model_name The name of the model to load on startup.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStartupModel(
    TRITONSERVER_ServerOptions* options, const char* model_name);
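/// For example, to configure a server that only loads an explicitly named
/// model (illustrative sketch, error checking omitted; the repository path
/// "/models" and model name "mymodel" are hypothetical):
///
///   TRITONSERVER_ServerOptions* options = nullptr;
///   TRITONSERVER_ServerOptionsNew(&options);
///   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
///   TRITONSERVER_ServerOptionsSetModelControlMode(
///       options, TRITONSERVER_MODEL_CONTROL_EXPLICIT);
///   TRITONSERVER_ServerOptionsSetStartupModel(options, "mymodel");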
/// Enable or disable strict model configuration handling in a server
/// options.
///
/// \param options The server options object.
/// \param strict True to enable strict model configuration handling,
/// false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictModelConfig(
    TRITONSERVER_ServerOptions* options, bool strict);
/// Set the rate limit mode in a server options.
///
/// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiting prioritizes
/// inference execution based on the number of times each instance has been
/// given a chance to run. An execution runs only when its resource
/// constraints are satisfied.
///
/// TRITONSERVER_RATE_LIMIT_OFF: The rate limiting is turned off and the
/// inference gets executed whenever an instance is available.
///
/// \param options The server options object.
/// \param mode The mode to use for the rate limiting. By default, execution
/// count is used to determine the priorities.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRateLimiterMode(
    TRITONSERVER_ServerOptions* options, TRITONSERVER_RateLimitMode mode);
/// Add resource count for rate limiting.
///
/// \param options The server options object.
/// \param resource_name The name of the resource.
/// \param resource_count The count of the resource.
/// \param device The device identifier for the resource. A value of -1
/// indicates that the specified number of resources are available on every
/// device. The device value is ignored for a global resource. The server
/// will use the rate limiter configuration specified for instance groups
/// in the model config to determine whether a resource is global. In case of
/// conflicting resource types in different model configurations, the server
/// will raise an appropriate error while loading the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsAddRateLimiterResource(
    TRITONSERVER_ServerOptions* options, const char* resource_name,
    const size_t resource_count, const int device);
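/// For example, to declare 4 units of a hypothetical resource named "R1" on
/// every device (illustrative sketch, error checking omitted):
///
///   TRITONSERVER_ServerOptionsSetRateLimiterMode(
///       options, TRITONSERVER_RATE_LIMIT_EXEC_COUNT);
///   TRITONSERVER_ServerOptionsAddRateLimiterResource(
///       options, "R1", 4, -1 /* available on every device */);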
/// Set the total pinned memory byte size that the server can allocate
/// in a server options. The pinned memory pool will be shared across
/// Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param size The pinned memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
    TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the total CUDA memory byte size that the server can allocate
/// on a given GPU device in a server options. The CUDA memory pool
/// will be shared across Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param gpu_device The GPU device to allocate the memory pool.
/// \param size The CUDA memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
    TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
/// Set the total response cache byte size that the server can allocate in CPU
/// memory. The response cache will be shared across all inference requests and
/// across all models.
///
/// \param options The server options object.
/// \param size The total response cache byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
    TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the minimum supported CUDA compute capability in a server
/// options.
///
/// \param options The server options object.
/// \param cc The minimum CUDA compute capability.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
    TRITONSERVER_ServerOptions* options, double cc);
/// Enable or disable exit-on-error in a server options.
///
/// \param options The server options object.
/// \param exit True to enable exiting on initialization error, false
/// to continue.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitOnError(
    TRITONSERVER_ServerOptions* options, bool exit);
/// Enable or disable strict readiness handling in a server options.
///
/// \param options The server options object.
/// \param strict True to enable strict readiness handling, false to
/// disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictReadiness(
    TRITONSERVER_ServerOptions* options, bool strict);
/// Set the exit timeout, in seconds, for the server in a server
/// options.
///
/// \param options The server options object.
/// \param timeout The exit timeout, in seconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitTimeout(
    TRITONSERVER_ServerOptions* options, unsigned int timeout);
/// Set the number of threads used in buffer manager in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBufferManagerThreadCount(
    TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Set the number of threads to concurrently load models in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadThreadCount(
    TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Provide a log output file.
///
/// \param options The server options object.
/// \param file a string defining the file where the log outputs will be saved.
/// An empty string for the file name will cause Triton to direct logging
/// facilities to the console.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFile(
    TRITONSERVER_ServerOptions* options, const char* file);
/// Enable or disable info level logging.
///
/// \param options The server options object.
/// \param log True to enable info logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogInfo(
    TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable warning level logging.
///
/// \param options The server options object.
/// \param log True to enable warning logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogWarn(
    TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable error level logging.
///
/// \param options The server options object.
/// \param log True to enable error logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogError(
    TRITONSERVER_ServerOptions* options, bool log);
/// Set the logging format.
///
/// \param options The server options object.
/// \param format The logging format.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFormat(
    TRITONSERVER_ServerOptions* options, const TRITONSERVER_LogFormat format);
/// Set verbose logging level. Level zero disables verbose logging.
///
/// \param options The server options object.
/// \param level The verbose logging level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogVerbose(
    TRITONSERVER_ServerOptions* options, int level);
/// Enable or disable metrics collection in a server options.
///
/// \param options The server options object.
/// \param metrics True to enable metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetrics(
    TRITONSERVER_ServerOptions* options, bool metrics);
/// Enable or disable GPU metrics collection in a server options. GPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param gpu_metrics True to enable GPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetGpuMetrics(
    TRITONSERVER_ServerOptions* options, bool gpu_metrics);
/// Enable or disable CPU metrics collection in a server options. CPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param cpu_metrics True to enable CPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCpuMetrics(
    TRITONSERVER_ServerOptions* options, bool cpu_metrics);
/// Set the interval for metrics collection in a server options.
/// This is 2000 milliseconds by default.
///
/// \param options The server options object.
/// \param metrics_interval_ms The time interval in ms between
/// successive metrics updates.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
    TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms);
/// Set the directory containing backend shared libraries. This
/// directory is searched last after the version and model directory
/// in the model repository when looking for the backend shared
/// library for a model. If the backend is named 'be' the directory
/// searched is 'backend_dir'/be/libtriton_be.so.
///
/// \param options The server options object.
/// \param backend_dir The full path of the backend directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
    TRITONSERVER_ServerOptions* options, const char* backend_dir);
/// Set the directory containing repository agent shared libraries. This
/// directory is searched when looking for the repository agent shared
/// library for a model. If the repository agent is named 'ra' the directory
/// searched is 'repoagent_dir'/ra/libtritonrepoagent_ra.so.
///
/// \param options The server options object.
/// \param repoagent_dir The full path of the repository agent directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
    TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
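The setters above are usually applied in a batch right after the options object is created. The following is a minimal sketch only: it assumes TRITONSERVER_ServerOptionsNew, TRITONSERVER_ErrorMessage and TRITONSERVER_ErrorDelete declared earlier in this header, FAIL_IF_ERR is a hypothetical convenience macro (not part of the API), and the backend directory path is illustrative.

#include <stdio.h>
#include <stdlib.h>
#include "tritonserver.h"

// Hypothetical helper: abort on any TRITONSERVER_Error.
#define FAIL_IF_ERR(X)                                                  \
  do {                                                                  \
    TRITONSERVER_Error* err__ = (X);                                    \
    if (err__ != nullptr) {                                             \
      fprintf(stderr, "error: %s\n", TRITONSERVER_ErrorMessage(err__)); \
      TRITONSERVER_ErrorDelete(err__);                                  \
      exit(1);                                                          \
    }                                                                   \
  } while (false)

TRITONSERVER_ServerOptions* options = nullptr;
FAIL_IF_ERR(TRITONSERVER_ServerOptionsNew(&options));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetLogVerbose(options, 1));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetMetrics(options, true));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetMetricsInterval(options, 2000));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetBackendDirectory(
    options, "/opt/tritonserver/backends"));  // illustrative path
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetExitOnError(options, true));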
/// Specify the limit on memory usage as a fraction on the device identified by
/// 'kind' and 'device_id'. If model loading on the device is requested and the
/// current memory usage exceeds the limit, the load will be rejected. If not
/// specified, the limit will not be set.
///
/// Currently supports TRITONSERVER_INSTANCEGROUPKIND_GPU.
///
/// \param options The server options object.
/// \param kind The kind of the device.
/// \param device_id The id of the device.
/// \param fraction The limit on memory usage as a fraction
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit(
    TRITONSERVER_ServerOptions* options,
    const TRITONSERVER_InstanceGroupKind kind, const int device_id,
    const double fraction);
/// Set a configuration setting for a named backend in a server
/// options.
///
/// \param options The server options object.
/// \param backend_name The name of the backend.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendConfig(
    TRITONSERVER_ServerOptions* options, const char* backend_name,
    const char* setting, const char* value);
/// Set a host policy setting for a given policy name in a server options.
///
/// \param options The server options object.
/// \param policy_name The name of the policy.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetHostPolicy(
    TRITONSERVER_ServerOptions* options, const char* policy_name,
    const char* setting, const char* value);
/// TRITONSERVER_Server
///
/// An inference server.
///
/// Model batch flags. The enum values must be power-of-2 values.
typedef enum tritonserver_batchflag_enum {
  TRITONSERVER_BATCH_UNKNOWN = 1,
  TRITONSERVER_BATCH_FIRST_DIM = 2
} TRITONSERVER_ModelBatchFlag;
/// Model index flags. The enum values must be power-of-2 values.
typedef enum tritonserver_modelindexflag_enum {
  TRITONSERVER_INDEX_FLAG_READY = 1
} TRITONSERVER_ModelIndexFlag;
/// Model transaction policy flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_txn_property_flag_enum {
  TRITONSERVER_TXN_ONE_TO_ONE = 1,
  TRITONSERVER_TXN_DECOUPLED = 2
} TRITONSERVER_ModelTxnPropertyFlag;
/// Create a new server object. The caller takes ownership of the
/// TRITONSERVER_Server object and must call TRITONSERVER_ServerDelete
/// to release the object.
///
/// \param server Returns the new inference server object.
/// \param options The inference server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerNew(
    TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* options);
/// Delete a server object. If server is not already stopped it is
/// stopped before being deleted.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerDelete(TRITONSERVER_Server* server);
/// Stop a server object. A server can't be restarted once it is
/// stopped.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerStop(TRITONSERVER_Server* server);
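A lifecycle sketch tying the three calls above together, continuing the earlier options example. TRITONSERVER_ServerOptionsSetModelRepositoryPath and TRITONSERVER_ServerOptionsDelete are declared earlier in this header; the repository path is illustrative and FAIL_IF_ERR is the hypothetical macro from the previous sketch.

TRITONSERVER_Server* server = nullptr;
FAIL_IF_ERR(TRITONSERVER_ServerOptionsSetModelRepositoryPath(
    options, "/models"));  // illustrative path
FAIL_IF_ERR(TRITONSERVER_ServerNew(&server, options));
FAIL_IF_ERR(TRITONSERVER_ServerOptionsDelete(options));  // no longer needed

// ... use the server ...

FAIL_IF_ERR(TRITONSERVER_ServerStop(server));   // optional; Delete also stops
FAIL_IF_ERR(TRITONSERVER_ServerDelete(server));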
/// Register a new model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \param name_mapping List of name_mapping parameters. Each mapping has
/// the model directory name as its key and the overridden model name as its value.
/// \param mapping_count Number of mappings provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerRegisterModelRepository(
    TRITONSERVER_Server* server, const char* repository_path,
    const TRITONSERVER_Parameter** name_mapping, const uint32_t mapping_count);
/// Unregister a model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnregisterModelRepository(
    TRITONSERVER_Server* server, const char* repository_path);
/// Check the model repository for changes and update server state
/// based on those changes.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerPollModelRepository(TRITONSERVER_Server* server);
/// Is the server live?
///
/// \param server The inference server object.
/// \param live Returns true if server is live, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerIsLive(TRITONSERVER_Server* server, bool* live);
/// Is the server ready?
///
/// \param server The inference server object.
/// \param ready Returns true if server is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerIsReady(TRITONSERVER_Server* server, bool* ready);
/// Is the model ready?
///
/// \param server The inference server object.
/// \param model_name The name of the model to get readiness for.
/// \param model_version The version of the model to get readiness
/// for. If -1 then the server will choose a version based on the
/// model's policy.
/// \param ready Returns true if the model is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelIsReady(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, bool* ready);
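A readiness-polling sketch using the three queries above. The model name is illustrative and FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

bool live = false, ready = false, model_ready = false;
for (int attempt = 0; attempt < 10; ++attempt) {
  FAIL_IF_ERR(TRITONSERVER_ServerIsLive(server, &live));
  FAIL_IF_ERR(TRITONSERVER_ServerIsReady(server, &ready));
  FAIL_IF_ERR(TRITONSERVER_ServerModelIsReady(
      server, "densenet_onnx" /* illustrative */, -1 /* latest per policy */,
      &model_ready));
  if (live && ready && model_ready) {
    break;
  }
  sleep(1);  // from <unistd.h>; any wait primitive works
}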
/// Get the batch properties of the model. The properties are
/// communicated by a flags value and an (optional) object returned by
/// 'voidp'.
///
/// - TRITONSERVER_BATCH_UNKNOWN: Triton cannot determine the
/// batching properties of the model. This means that the model
/// does not support batching in any way that is usable by
/// Triton. The returned 'voidp' value is nullptr.
///
/// - TRITONSERVER_BATCH_FIRST_DIM: The model supports batching
/// along the first dimension of every input and output
/// tensor. Triton schedulers that perform batching can
/// automatically batch inference requests along this dimension.
/// The returned 'voidp' value is nullptr.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param flags Returns flags indicating the batch properties of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the
/// 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelBatchProperties(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, uint32_t* flags, void** voidp);
/// Get the transaction policy of the model. The policy is
/// communicated by a flags value.
///
/// - TRITONSERVER_TXN_ONE_TO_ONE: The model generates exactly
/// one response per request.
///
/// - TRITONSERVER_TXN_DECOUPLED: The model may generate zero
/// to many responses per request.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param txn_flags Returns flags indicating the transaction policy of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelTransactionProperties(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, uint32_t* txn_flags, void** voidp);
/// Get the metadata of the server as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param server_metadata Returns the server metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerMetadata(
    TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
/// Get the metadata of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the message object and must
/// call TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model.
/// If -1 then the server will choose a version based on the model's
/// policy.
/// \param model_metadata Returns the model metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelMetadata(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, TRITONSERVER_Message** model_metadata);
/// Get the statistics of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// If empty, then statistics for all available models will be returned,
/// and the server will choose a version based on those models' policies.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param model_stats Returns the model statistics message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelStatistics(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, TRITONSERVER_Message** model_stats);
/// Get the configuration of a model as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model config message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelConfig(
    TRITONSERVER_Server* server, const char* model_name,
    const int64_t model_version, const uint32_t config_version,
    TRITONSERVER_Message** model_config);
/// Get the index of all unique models in the model repositories as a
/// TRITONSERVER_Message object. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object.
///
/// If TRITONSERVER_INDEX_FLAG_READY is set in 'flags' only the models
/// that are loaded into the server and ready for inferencing are
/// returned.
///
/// \param server The inference server object.
/// \param flags TRITONSERVER_ModelIndexFlag flags that control how to
/// collect the index.
/// \param model_index Return the model index message that holds the
/// index of all models contained in the server's model repository(s).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelIndex(
    TRITONSERVER_Server* server, uint32_t flags,
    TRITONSERVER_Message** model_index);
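A sketch that fetches the index of ready models and prints it as JSON. TRITONSERVER_MessageSerializeToJson and TRITONSERVER_MessageDelete are declared earlier in this header; FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

TRITONSERVER_Message* model_index = nullptr;
FAIL_IF_ERR(TRITONSERVER_ServerModelIndex(
    server, TRITONSERVER_INDEX_FLAG_READY, &model_index));

const char* index_json = nullptr;
size_t index_json_size = 0;
FAIL_IF_ERR(TRITONSERVER_MessageSerializeToJson(
    model_index, &index_json, &index_json_size));
printf("%.*s\n", (int)index_json_size, index_json);
FAIL_IF_ERR(TRITONSERVER_MessageDelete(model_index));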
/// Load the requested model or reload the model if it is already
/// loaded. The function does not return until the model is loaded or
/// fails to load. Returned error indicates if model loaded
/// successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModel(
    TRITONSERVER_Server* server, const char* model_name);
/// Load the requested model or reload the model if it is already
/// loaded, with load parameters provided. The function does not return until
/// the model is loaded or fails to load. Returned error indicates if model
/// loaded successfully or not.
/// Currently the below parameter names are recognized:
/// - "config" : string parameter that contains a JSON representation of the
/// model configuration. This config will be used for loading the model instead
/// of the one in the model directory.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param parameters The array of load parameters.
/// \param parameter_count The number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModelWithParameters(
    TRITONSERVER_Server* server, const char* model_name,
    const TRITONSERVER_Parameter** parameters, const uint64_t parameter_count);
/// Unload the requested model. Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model to be fully unloaded
/// before returning a success code.
/// Returned error indicates if model unloaded successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModel(
    TRITONSERVER_Server* server, const char* model_name);
/// Unload the requested model, and also unload any dependent model that
/// was loaded along with the requested model (for example, the models composing
/// an ensemble). Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model and all dependent
/// models to be fully unloaded before returning a success code.
/// Returned error indicates if model unloaded successfully or not.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModelAndDependents(
    TRITONSERVER_Server* server, const char* model_name);
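A sketch of an explicit load with a configuration override followed by an unload. It assumes the server's model-control mode permits load/unload requests, that TRITONSERVER_ParameterNew/Delete are declared earlier in this header, and that the model name and JSON override are purely illustrative. FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

// "config" is the parameter name documented above; its value replaces the
// config.pbtxt found in the model directory for this load only.
const char* override_config =
    "{\"name\":\"densenet_onnx\",\"backend\":\"onnxruntime\",\"max_batch_size\":8}";
TRITONSERVER_Parameter* config_param = TRITONSERVER_ParameterNew(
    "config", TRITONSERVER_PARAMETER_STRING, override_config);
const TRITONSERVER_Parameter* params[] = {config_param};
FAIL_IF_ERR(TRITONSERVER_ServerLoadModelWithParameters(
    server, "densenet_onnx", params, 1));
TRITONSERVER_ParameterDelete(config_param);

// Later, drop the model together with anything loaded along with it.
FAIL_IF_ERR(
    TRITONSERVER_ServerUnloadModelAndDependents(server, "densenet_onnx"));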
/// Get the current metrics for the server. The caller takes ownership
/// of the metrics object and must call TRITONSERVER_MetricsDelete to
/// release the object.
///
/// \param server The inference server object.
/// \param metrics Returns the metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerMetrics(
    TRITONSERVER_Server* server, TRITONSERVER_Metrics** metrics);
/// Perform inference using the meta-data and inputs supplied by the
/// 'inference_request'. If the function returns success, then the
/// caller releases ownership of 'inference_request' and must not
/// access it in any way after this call, until ownership is returned
/// via the 'request_release_fn' callback registered in the request
/// object with TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// The function unconditionally takes ownership of 'trace' and so the
/// caller must not access it in any way after this call (except in
/// the trace activity callbacks) until ownership is returned via the
/// trace's release_fn callback.
///
/// Responses produced for this request are returned using the
/// allocator and callback registered with the request by
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// \param server The inference server object.
/// \param inference_request The request object.
/// \param trace The trace object for this request, or nullptr if no
/// tracing.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerInferAsync(
    TRITONSERVER_Server* server,
    TRITONSERVER_InferenceRequest* inference_request,
    TRITONSERVER_InferenceTrace* trace);
/// TRITONSERVER_MetricKind
///
/// Types of metrics recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_metrickind_enum {
  TRITONSERVER_METRIC_KIND_COUNTER,
  TRITONSERVER_METRIC_KIND_GAUGE
} TRITONSERVER_MetricKind;
/// Create a new metric family object. The caller takes ownership of the
/// TRITONSERVER_MetricFamily object and must call
/// TRITONSERVER_MetricFamilyDelete to release the object.
///
/// \param family Returns the new metric family object.
/// \param kind The type of metric family to create.
/// \param name The name of the metric family seen when calling the metrics
/// endpoint.
/// \param description The description of the metric family seen when
/// calling the metrics endpoint.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricFamilyNew(
    TRITONSERVER_MetricFamily** family, const TRITONSERVER_MetricKind kind,
    const char* name, const char* description);
/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricFamilyDelete(TRITONSERVER_MetricFamily* family);
/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricNew(
    TRITONSERVER_Metric** metric, TRITONSERVER_MetricFamily* family,
    const TRITONSERVER_Parameter** labels, const uint64_t label_count);
/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricDelete(TRITONSERVER_Metric* metric);
/// Get the current value of a metric object.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_COUNTER
/// and TRITONSERVER_METRIC_KIND_GAUGE, and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to query.
/// \param value Returns the current value of the metric object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricValue(TRITONSERVER_Metric* metric, double* value);
/// Increment the current value of metric by value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE for any value,
/// and TRITONSERVER_METRIC_KIND_COUNTER for non-negative values. Returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind
/// and TRITONSERVER_ERROR_INVALID_ARG for negative values on a
/// TRITONSERVER_METRIC_KIND_COUNTER metric.
///
/// \param metric The metric object to update.
/// \param value The amount to increment the metric's value by.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricIncrement(TRITONSERVER_Metric* metric, double value);
/// Set the current value of metric to value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The amount to set metric's value to.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MetricSet(TRITONSERVER_Metric* metric, double value);
/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_GetMetricKind(
    TRITONSERVER_Metric* metric, TRITONSERVER_MetricKind* kind);
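Putting the metric API together: a sketch that registers a custom counter, labels it, updates it, and tears it down in the required order (metrics before their family). The family name and label are illustrative; TRITONSERVER_ParameterNew/Delete are declared earlier in this header and FAIL_IF_ERR is the hypothetical macro from the earlier sketch.

TRITONSERVER_MetricFamily* family = nullptr;
FAIL_IF_ERR(TRITONSERVER_MetricFamilyNew(
    &family, TRITONSERVER_METRIC_KIND_COUNTER, "custom_requests_total",
    "example counter exported next to Triton's built-in metrics"));

TRITONSERVER_Parameter* label = TRITONSERVER_ParameterNew(
    "route", TRITONSERVER_PARAMETER_STRING, "example");
const TRITONSERVER_Parameter* labels[] = {label};
TRITONSERVER_Metric* metric = nullptr;
FAIL_IF_ERR(TRITONSERVER_MetricNew(&metric, family, labels, 1));
TRITONSERVER_ParameterDelete(label);  // labels may be released after MetricNew

FAIL_IF_ERR(TRITONSERVER_MetricIncrement(metric, 1.0));
double value = 0.0;
FAIL_IF_ERR(TRITONSERVER_MetricValue(metric, &value));

// Metrics must be deleted before their family.
FAIL_IF_ERR(TRITONSERVER_MetricDelete(metric));
FAIL_IF_ERR(TRITONSERVER_MetricFamilyDelete(family));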
#ifdef __cplusplus
}
#endif
3rdparty/core-r22.12/src/backend_config.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_config.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {

namespace {
Status
GetTFSpecializedBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* specialized_name)
{
  std::string tf_version_str = "2";
  const auto& itr = config_map.find("tensorflow");
  if (itr != config_map.end()) {
    if (BackendConfiguration(itr->second, "version", &tf_version_str).IsOk()) {
      if ((tf_version_str != "1") && (tf_version_str != "2")) {
        return Status(
            Status::Code::INVALID_ARG,
            "unexpected TensorFlow library version '" + tf_version_str +
                "', expects 1 or 2.");
      }
    }
  }

  *specialized_name += tf_version_str;

  return Status::Success;
}

}  // namespace
Status
BackendConfiguration(
    const triton::common::BackendCmdlineConfig& config, const std::string& key,
    std::string* val)
{
  for (const auto& pr : config) {
    if (pr.first == key) {
      *val = pr.second;
      return Status::Success;
    }
  }
  return Status(
      Status::Code::INTERNAL,
      std::string("unable to find common backend configuration for '") + key +
          "'");
}
Status
BackendConfigurationParseStringToDouble(const std::string& str, double* val)
{
  try {
    *val = std::stod(str);
  }
  catch (...) {
    return Status(
        Status::Code::INTERNAL,
        "unable to parse common backend configuration as double");
  }
  return Status::Success;
}
Status
BackendConfigurationParseStringToBool(const std::string& str, bool* val)
{
  try {
    std::string lowercase_str{str};
    std::transform(
        lowercase_str.begin(), lowercase_str.end(), lowercase_str.begin(),
        [](unsigned char c) { return std::tolower(c); });
    *val = (lowercase_str == "true");
  }
  catch (...) {
    return Status(
        Status::Code::INTERNAL,
        "unable to parse common backend configuration as bool");
  }
  return Status::Success;
}
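As an illustrative aside, this is how the helpers above are typically combined when reading a backend's command-line configuration. The keys and values below are invented for the sketch, and BackendCmdlineConfig is the list-of-pairs type from triton/common/model_config.h.

// Illustrative sketch only: parse two hypothetical cmdline settings.
triton::common::BackendCmdlineConfig config = {
    {"gpu-memory-fraction", "0.8"}, {"auto-complete-config", "true"}};

std::string raw;
double fraction = 1.0;
if (triton::core::BackendConfiguration(config, "gpu-memory-fraction", &raw)
        .IsOk()) {
  triton::core::BackendConfigurationParseStringToDouble(raw, &fraction);
}

bool auto_complete = false;
if (triton::core::BackendConfiguration(config, "auto-complete-config", &raw)
        .IsOk()) {
  triton::core::BackendConfigurationParseStringToBool(raw, &auto_complete);
}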
Status
BackendConfigurationGlobalBackendsDirectory(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* dir)
{
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL,
        "unable to find global backends directory configuration");
  }
  RETURN_IF_ERROR(BackendConfiguration(itr->second, "backend-directory", dir));
  return Status::Success;
}
Status
BackendConfigurationMinComputeCapability(
    const triton::common::BackendCmdlineConfigMap& config_map, double* mcc)
{
#ifdef TRITON_ENABLE_GPU
  *mcc = TRITON_MIN_COMPUTE_CAPABILITY;
#else
  *mcc = 0;
#endif  // TRITON_ENABLE_GPU
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find common backend configuration");
  }
  std::string min_compute_capability_str;
  RETURN_IF_ERROR(BackendConfiguration(
      itr->second, "min-compute-capability", &min_compute_capability_str));
  RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
      min_compute_capability_str, mcc));
  return Status::Success;
}
Status
BackendConfigurationAutoCompleteConfig(
    const triton::common::BackendCmdlineConfigMap& config_map, bool* acc)
{
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find auto-complete configuration");
  }
  std::string auto_complete_config_str;
  RETURN_IF_ERROR(BackendConfiguration(
      itr->second, "auto-complete-config", &auto_complete_config_str));
  RETURN_IF_ERROR(BackendConfigurationParseStringToBool(
      auto_complete_config_str, acc));
  return Status::Success;
}
Status
BackendConfigurationSpecializeBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const std::string& backend_name, std::string* specialized_name)
{
  *specialized_name = backend_name;
  if (backend_name == "tensorflow") {
    RETURN_IF_ERROR(GetTFSpecializedBackendName(config_map, specialized_name));
  }
  return Status::Success;
}
Status
BackendConfigurationBackendLibraryName(
    const std::string& backend_name, std::string* libname)
{
#ifdef _WIN32
  *libname = "triton_" + backend_name + ".dll";
#else
  *libname = "libtriton_" + backend_name + ".so";
#endif
  return Status::Success;
}
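A short sketch of how the two helpers above resolve the file name the server will search for. With an empty configuration map, GetTFSpecializedBackendName leaves the TensorFlow version at its default of "2", so "tensorflow" specializes to "tensorflow2"; status returns are ignored here purely for brevity.

triton::common::BackendCmdlineConfigMap config_map;  // empty: use defaults
std::string specialized, libname;
triton::core::BackendConfigurationSpecializeBackendName(
    config_map, "tensorflow", &specialized);             // -> "tensorflow2"
triton::core::BackendConfigurationBackendLibraryName(specialized, &libname);
// libname: "libtriton_tensorflow2.so" ("triton_tensorflow2.dll" on Windows)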
Status
BackendConfigurationModelLoadGpuFraction(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const int device_id, double* memory_limit)
{
  *memory_limit = 1.0;
  const auto& itr = config_map.find(std::string());
  if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL,
        "unable to find global backends directory configuration");
  }
  static std::string key_prefix = "model-load-gpu-limit-device-";
  std::string memory_limit_str;
  auto status = BackendConfiguration(
      itr->second, key_prefix + std::to_string(device_id), &memory_limit_str);
  // Allow missing key, default to 1.0 (no limit) if the limit is not specified
  if (status.IsOk()) {
    RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
        memory_limit_str, memory_limit));
  }
  return Status::Success;
}
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_config.h 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
/// Get a key's string value from a backend configuration.
Status BackendConfiguration(
    const triton::common::BackendCmdlineConfig& config, const std::string& key,
    std::string* val);
/// Convert a backend configuration string value into a double.
Status BackendConfigurationParseStringToDouble(
    const std::string& str, double* val);
/// Convert a backend configuration string value into a bool.
Status BackendConfigurationParseStringToBool(const std::string& str, bool* val);
/// Get the global backends directory from the backend configuration.
Status BackendConfigurationGlobalBackendsDirectory(
    const triton::common::BackendCmdlineConfigMap& config_map,
    std::string* dir);
/// Get the minimum compute capability from the backend configuration.
Status BackendConfigurationMinComputeCapability(
    const triton::common::BackendCmdlineConfigMap& config_map, double* mcc);
/// Get the model configuration auto-complete setting from the backend
/// configuration.
Status BackendConfigurationAutoCompleteConfig(
    const triton::common::BackendCmdlineConfigMap& config_map, bool* acc);
/// Convert a backend name to the specialized version of that name
/// based on the backend configuration. For example, "tensorflow" will
/// convert to either "tensorflow1" or "tensorflow2" depending on how
/// tritonserver is run.
Status BackendConfigurationSpecializeBackendName(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const std::string& backend_name, std::string* specialized_name);
/// Return the shared library name for a backend.
Status BackendConfigurationBackendLibraryName(
    const std::string& backend_name, std::string* libname);
/// Get GPU memory limit fraction for model loading
/// from the backend configuration.
Status BackendConfigurationModelLoadGpuFraction(
    const triton::common::BackendCmdlineConfigMap& config_map,
    const int device_id, double* memory_limit);
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_manager.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_manager.h"
#include "backend_memory_manager.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
//
// TritonBackend
//
Status
TritonBackend::Create(
    const std::string& name, const std::string& dir,
    const std::string& libpath,
    const triton::common::BackendCmdlineConfig& backend_cmdline_config,
    std::shared_ptr<TritonBackend>* backend)
{
  // Create the JSON representation of the backend configuration.
  triton::common::TritonJson::Value backend_config_json(
      triton::common::TritonJson::ValueType::OBJECT);
  if (!backend_cmdline_config.empty()) {
    triton::common::TritonJson::Value cmdline_json(
        backend_config_json, triton::common::TritonJson::ValueType::OBJECT);
    for (const auto& pr : backend_cmdline_config) {
      RETURN_IF_ERROR(cmdline_json.AddString(pr.first.c_str(), pr.second));
    }
    RETURN_IF_ERROR(
        backend_config_json.Add("cmdline", std::move(cmdline_json)));
  }
  TritonServerMessage backend_config(backend_config_json);

  auto local_backend = std::shared_ptr<TritonBackend>(
      new TritonBackend(name, dir, libpath, backend_config));

  // Load the library and initialize all the entrypoints
  RETURN_IF_ERROR(local_backend->LoadBackendLibrary());

  // Backend initialization is optional... The TRITONBACKEND_Backend
  // object is this TritonBackend object. We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
  if (local_backend->backend_init_fn_ != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(local_backend->dir_));

    TRITONSERVER_Error* err = local_backend->backend_init_fn_(
        reinterpret_cast<TRITONBACKEND_Backend*>(local_backend.get()));

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  local_backend->UpdateAttributes();

  *backend = std::move(local_backend);
  return Status::Success;
}
Status
TritonBackend::UpdateAttributes()
{
  if (backend_attri_fn_ == nullptr) {
    return Status::Success;
  }

  // Create an Attribute object for the backend to fill, note that it copies
  // some fields from 'attributes_' while the others use default value. This
  // is an ad hoc way to determine whether the attribute is set by the backend
  // and keep / update current value.
  Attribute latest;
  latest.exec_policy_ = attributes_.exec_policy_;
  RETURN_IF_TRITONSERVER_ERROR(backend_attri_fn_(
      reinterpret_cast<TRITONBACKEND_Backend*>(this),
      reinterpret_cast<TRITONBACKEND_BackendAttribute*>(&latest)));

  // Update attributes that were set
  attributes_.exec_policy_ = latest.exec_policy_;
  if (!latest.preferred_groups_.empty()) {
    attributes_.preferred_groups_ = latest.preferred_groups_;
  }
  return Status::Success;
}
TritonBackend::TritonBackend(
    const std::string& name, const std::string& dir,
    const std::string& libpath, const TritonServerMessage& backend_config)
    : name_(name), dir_(dir), libpath_(libpath),
      backend_config_(backend_config), state_(nullptr)
{
  ClearHandles();
}
TritonBackend::~TritonBackend()
{
  LOG_VERBOSE(1) << "unloading backend '" << name_ << "'";

  // Backend finalization is optional... The TRITONBACKEND_Backend
  // object is this TritonBackend object.
  if (backend_fini_fn_ != nullptr) {
    LOG_TRITONSERVER_ERROR(
        backend_fini_fn_(reinterpret_cast<TRITONBACKEND_Backend*>(this)),
        "failed finalizing backend");
  }

  ClearHandles();
}
void
TritonBackend::ClearHandles()
{
  dlhandle_ = nullptr;
  backend_init_fn_ = nullptr;
  backend_fini_fn_ = nullptr;
  backend_attri_fn_ = nullptr;
  model_init_fn_ = nullptr;
  model_fini_fn_ = nullptr;
  inst_init_fn_ = nullptr;
  inst_fini_fn_ = nullptr;
  inst_exec_fn_ = nullptr;
}
Status
TritonBackend::LoadBackendLibrary()
{
  TritonBackendInitFn_t bifn;
  TritonBackendFiniFn_t bffn;
  TritonBackendAttriFn_t bafn;
  TritonModelInitFn_t mifn;
  TritonModelFiniFn_t mffn;
  TritonModelInstanceInitFn_t iifn;
  TritonModelInstanceFiniFn_t iffn;
  TritonModelInstanceExecFn_t iefn;

  {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));

    RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath_, &dlhandle_));

    // Backend initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_Initialize", true /* optional */,
        reinterpret_cast<void**>(&bifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_Finalize", true /* optional */,
        reinterpret_cast<void**>(&bffn)));
    // Backend attribute function, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_GetBackendAttribute", true /* optional */,
        reinterpret_cast<void**>(&bafn)));
    // Model initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInitialize", true /* optional */,
        reinterpret_cast<void**>(&mifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelFinalize", true /* optional */,
        reinterpret_cast<void**>(&mffn)));
    // Model instance initialize and finalize functions, optional
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceInitialize", true /* optional */,
        reinterpret_cast<void**>(&iifn)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceFinalize", true /* optional */,
        reinterpret_cast<void**>(&iffn)));
    // Model instance execute function, required
    RETURN_IF_ERROR(slib->GetEntrypoint(
        dlhandle_, "TRITONBACKEND_ModelInstanceExecute", false /* optional */,
        reinterpret_cast<void**>(&iefn)));
  }

  backend_init_fn_ = bifn;
  backend_fini_fn_ = bffn;
  backend_attri_fn_ = bafn;
  model_init_fn_ = mifn;
  model_fini_fn_ = mffn;
  inst_init_fn_ = iifn;
  inst_fini_fn_ = iffn;
  inst_exec_fn_ = iefn;

  return Status::Success;
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ApiVersion(uint32_t* major, uint32_t* minor)
{
  *major = TRITONBACKEND_API_VERSION_MAJOR;
  *minor = TRITONBACKEND_API_VERSION_MINOR;
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendName(TRITONBACKEND_Backend* backend, const char** name)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *name = tb->Name().c_str();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendConfig(
    TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *backend_config = const_cast<TRITONSERVER_Message*>(
      reinterpret_cast<const TRITONSERVER_Message*>(&tb->BackendConfig()));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *policy = tb->ExecutionPolicy();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  tb->SetExecutionPolicy(policy);
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendArtifacts(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
  *location = tb->Directory().c_str();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendMemoryManager(
    TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager)
{
  static TritonMemoryManager gMemoryManager;
  *manager = reinterpret_cast<TRITONBACKEND_MemoryManager*>(&gMemoryManager);
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendState(TRITONBACKEND_Backend* backend, void** state)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  *state = tb->State();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetState(TRITONBACKEND_Backend* backend, void* state)
{
  TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
  tb->SetState(state);
  return nullptr;  // success
}
}  // extern C
//
// TritonBackendManager
//
static std::weak_ptr<TritonBackendManager> backend_manager_;
static std::mutex mu_;
Status
TritonBackendManager::Create(std::shared_ptr<TritonBackendManager>* manager)
{
  std::lock_guard<std::mutex> lock(mu_);

  // If there is already a manager then we just use it...
  *manager = backend_manager_.lock();
  if (*manager != nullptr) {
    return Status::Success;
  }

  manager->reset(new TritonBackendManager());
  backend_manager_ = *manager;
  return Status::Success;
}
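TritonBackendManager::Create hands out a process-wide manager through a static std::weak_ptr: while any owner keeps the shared_ptr alive, new callers reuse the same instance, and once the last owner releases it the next call transparently builds a fresh one. A generic, self-contained sketch of the same pattern (the names are illustrative, not Triton APIs):

#include <memory>
#include <mutex>

class Registry {
 public:
  static std::shared_ptr<Registry> Get()
  {
    static std::mutex mu;
    static std::weak_ptr<Registry> weak;
    std::lock_guard<std::mutex> lock(mu);
    if (auto existing = weak.lock()) {
      return existing;  // an instance is still alive, share it
    }
    auto fresh = std::shared_ptr<Registry>(new Registry());
    weak = fresh;  // remember it without extending its lifetime
    return fresh;
  }

 private:
  Registry() = default;
};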
Status
TritonBackendManager::CreateBackend(
    const std::string& name, const std::string& dir,
    const std::string& libpath,
    const triton::common::BackendCmdlineConfig& backend_cmdline_config,
    std::shared_ptr<TritonBackend>* backend)
{
  std::lock_guard<std::mutex> lock(mu_);

  const auto& itr = backend_map_.find(libpath);
  if (itr != backend_map_.end()) {
    *backend = itr->second;
    return Status::Success;
  }

  RETURN_IF_ERROR(TritonBackend::Create(
      name, dir, libpath, backend_cmdline_config, backend));
  backend_map_.insert({libpath, *backend});

  return Status::Success;
}
Status
TritonBackendManager::BackendState(
    std::unique_ptr<
        std::unordered_map<std::string, std::vector<std::string>>>*
        backend_state)
{
  std::lock_guard<std::mutex> lock(mu_);

  std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>
      backend_state_map(
          new std::unordered_map<std::string, std::vector<std::string>>);
  for (const auto& backend_pair : backend_map_) {
    auto& libpath = backend_pair.first;
    auto backend = backend_pair.second;

    const char* backend_config;
    size_t backend_config_size;
    backend->BackendConfig().Serialize(&backend_config, &backend_config_size);
    backend_state_map->insert(
        {backend->Name(), std::vector<std::string>{libpath, backend_config}});
  }

  *backend_state = std::move(backend_state_map);

  return Status::Success;
}
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_manager.h 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "constants.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// Proxy to a backend shared library.
//
class TritonBackend {
 public:
  struct Attribute {
    Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
    TRITONBACKEND_ExecutionPolicy exec_policy_;
    std::vector<inference::ModelInstanceGroup> preferred_groups_;
  };

  typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
      TRITONBACKEND_Model* model);
  typedef TRITONSERVER_Error* (*TritonModelFiniFn_t)(
      TRITONBACKEND_Model* model);
  typedef TRITONSERVER_Error* (*TritonModelInstanceInitFn_t)(
      TRITONBACKEND_ModelInstance* instance);
  typedef TRITONSERVER_Error* (*TritonModelInstanceFiniFn_t)(
      TRITONBACKEND_ModelInstance* instance);
  typedef TRITONSERVER_Error* (*TritonModelInstanceExecFn_t)(
      TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
      const uint32_t request_cnt);

  static Status Create(
      const std::string& name, const std::string& dir,
      const std::string& libpath,
      const triton::common::BackendCmdlineConfig& backend_cmdline_config,
      std::shared_ptr<TritonBackend>* backend);
  ~TritonBackend();
  const std::string& Name() const { return name_; }
  const std::string& Directory() const { return dir_; }
  const TritonServerMessage& BackendConfig() const { return backend_config_; }
  const Attribute& BackendAttributes() const { return attributes_; }
  TRITONBACKEND_ExecutionPolicy ExecutionPolicy() const
  {
    return attributes_.exec_policy_;
  }
  void SetExecutionPolicy(const TRITONBACKEND_ExecutionPolicy policy)
  {
    attributes_.exec_policy_ = policy;
  }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  TritonModelInitFn_t ModelInitFn() const { return model_init_fn_; }
  TritonModelFiniFn_t ModelFiniFn() const { return model_fini_fn_; }
  TritonModelInstanceInitFn_t ModelInstanceInitFn() const
  {
    return inst_init_fn_;
  }
  TritonModelInstanceFiniFn_t ModelInstanceFiniFn() const
  {
    return inst_fini_fn_;
  }
  TritonModelInstanceExecFn_t ModelInstanceExecFn() const
  {
    return inst_exec_fn_;
  }
 private:
  typedef TRITONSERVER_Error* (*TritonBackendInitFn_t)(
      TRITONBACKEND_Backend* backend);
  typedef TRITONSERVER_Error* (*TritonBackendFiniFn_t)(
      TRITONBACKEND_Backend* backend);
  typedef TRITONSERVER_Error* (*TritonBackendAttriFn_t)(
      TRITONBACKEND_Backend* backend,
      TRITONBACKEND_BackendAttribute* backend_attributes);

  TritonBackend(
      const std::string& name, const std::string& dir,
      const std::string& libpath, const TritonServerMessage& backend_config);

  void ClearHandles();
  Status LoadBackendLibrary();
  Status UpdateAttributes();

  // The name of the backend.
  const std::string name_;

  // Full path to the directory holding backend shared library and
  // other artifacts.
  const std::string dir_;

  // Full path to the backend shared library.
  const std::string libpath_;

  // Backend configuration as JSON
  TritonServerMessage backend_config_;

  // backend attributes
  Attribute attributes_;

  // dlopen / dlsym handles
  void* dlhandle_;
  TritonBackendInitFn_t backend_init_fn_;
  TritonBackendFiniFn_t backend_fini_fn_;
  TritonBackendAttriFn_t backend_attri_fn_;
  TritonModelInitFn_t model_init_fn_;
  TritonModelFiniFn_t model_fini_fn_;
  TritonModelInstanceInitFn_t inst_init_fn_;
  TritonModelInstanceFiniFn_t inst_fini_fn_;
  TritonModelInstanceExecFn_t inst_exec_fn_;

  // Opaque state associated with the backend.
  void* state_;
};
//
// Manage communication with Triton backends and their lifecycle.
//
class TritonBackendManager {
 public:
  static Status Create(std::shared_ptr<TritonBackendManager>* manager);

  Status CreateBackend(
      const std::string& name, const std::string& dir,
      const std::string& libpath,
      const triton::common::BackendCmdlineConfig& backend_cmdline_config,
      std::shared_ptr<TritonBackend>* backend);

  Status BackendState(
      std::unique_ptr<
          std::unordered_map<std::string, std::vector<std::string>>>*
          backend_state);

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonBackendManager);
  TritonBackendManager() = default;

  std::unordered_map<std::string, std::shared_ptr<TritonBackend>> backend_map_;
};
}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_memory_manager.cc 0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_memory_manager.h"
#include "pinned_memory_manager.h"
#include "status.h"
#include "tritonserver_apis.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerAllocate(
    TRITONBACKEND_MemoryManager* manager, void** buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
    const uint64_t byte_size)
{
  switch (memory_type) {
    case TRITONSERVER_MEMORY_GPU:
#ifdef TRITON_ENABLE_GPU
    {
      auto status = CudaMemoryManager::Alloc(buffer, byte_size, memory_type_id);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.ErrorCode()),
            status.Message().c_str());
      }
      break;
    }
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "GPU memory allocation not supported");
#endif  // TRITON_ENABLE_GPU

    case TRITONSERVER_MEMORY_CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
    {
      TRITONSERVER_MemoryType mt = memory_type;
      auto status = PinnedMemoryManager::Alloc(buffer, byte_size, &mt, false);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.ErrorCode()),
            status.Message().c_str());
      }
      break;
    }
#else
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_UNSUPPORTED,
          "Pinned memory allocation not supported");
#endif  // TRITON_ENABLE_GPU

    case TRITONSERVER_MEMORY_CPU: {
      *buffer = malloc(byte_size);
      if (*buffer == nullptr) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_UNAVAILABLE, "CPU memory allocation failed");
      }
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerFree(
    TRITONBACKEND_MemoryManager* manager, void* buffer,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
  switch (memory_type) {
    case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
      auto status = CudaMemoryManager::Free(buffer, memory_type_id);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.StatusCode()),
            status.Message().c_str());
      }
#endif  // TRITON_ENABLE_GPU
      break;
    }

    case TRITONSERVER_MEMORY_CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
      auto status = PinnedMemoryManager::Free(buffer);
      if (!status.IsOk()) {
        return TRITONSERVER_ErrorNew(
            StatusCodeToTritonCode(status.StatusCode()),
            status.Message().c_str());
      }
#endif  // TRITON_ENABLE_GPU
      break;
    }

    case TRITONSERVER_MEMORY_CPU:
      free(buffer);
      break;
  }

  return nullptr;  // success
}

}  // extern C

}}  // namespace triton::core
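A hedged backend-side sketch of how these two entry points are typically reached, assuming the tritonbackend.h header from this tree; the scratch size and the choice of CPU memory are arbitrary illustrations, and error handling is abbreviated.

// Illustrative sketch only; not part of the committed sources.
#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
AllocScratch(TRITONBACKEND_Backend* backend, void** scratch)
{
  TRITONBACKEND_MemoryManager* mm = nullptr;
  TRITONSERVER_Error* err = TRITONBACKEND_BackendMemoryManager(backend, &mm);
  if (err != nullptr) {
    return err;
  }

  // Forwards to the core managers shown above: CUDA, pinned, or plain malloc.
  return TRITONBACKEND_MemoryManagerAllocate(
      mm, scratch, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */,
      1024 /* byte_size */);
  // The caller later releases the buffer with TRITONBACKEND_MemoryManagerFree.
}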
3rdparty/core-r22.12/src/backend_memory_manager.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace core {

// Currently there is just a global memory manager that is used for
// all backends and which simply forwards requests on to the core
// memory manager.
struct TritonMemoryManager {
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_model.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model.h"
#include <vector>
#include "backend_config.h"
#include "backend_model_instance.h"
#include "dynamic_batch_scheduler.h"
#include "filesystem.h"
#include "model_config_utils.h"
#include "numa_utils.h"
#include "sequence_batch_scheduler.h"
#include "sequence_state.h"
#include "server.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

Status
TritonModel::Create(
    InferenceServer* server, const std::string& model_path,
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
    const std::string& model_name, const int64_t version,
    inference::ModelConfig model_config, const bool is_config_provided,
    std::unique_ptr<TritonModel>* model)
{
  model->reset();

  // The model configuration must specify a backend. The name of the
  // corresponding shared library must be libtriton_<backend>.so.
  if (model_config.backend().empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "must specify 'backend' for '" + model_config.name() + "'");
  }

  // Localize the content of the model repository corresponding to
  // 'model_name'. This model holds a handle to the localized content
  // so that it persists as long as the model is loaded.
  std::shared_ptr<LocalizedPath> localized_model_dir;
  RETURN_IF_ERROR(LocalizePath(model_path, &localized_model_dir));

  // Localize paths in backend model config
  // [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
  RETURN_IF_ERROR(LocalizePythonBackendExecutionEnvironmentPath(
      model_path, &model_config, &localized_model_dir));

  // Get some internal configuration values needed for initialization.
  std::string backend_dir;
  RETURN_IF_ERROR(BackendConfigurationGlobalBackendsDirectory(
      backend_cmdline_config_map, &backend_dir));

  bool auto_complete_config = false;
  RETURN_IF_ERROR(BackendConfigurationAutoCompleteConfig(
      backend_cmdline_config_map, &auto_complete_config));

  double min_compute_capability = 0;
  RETURN_IF_ERROR(BackendConfigurationMinComputeCapability(
      backend_cmdline_config_map, &min_compute_capability));

  std::string specialized_backend_name;
  RETURN_IF_ERROR(BackendConfigurationSpecializeBackendName(
      backend_cmdline_config_map, model_config.backend(),
      &specialized_backend_name));

  std::string backend_libname;
  RETURN_IF_ERROR(BackendConfigurationBackendLibraryName(
      specialized_backend_name, &backend_libname));

  // Get the path to the backend shared library. Search path is
  // version directory, model directory, global backend directory.
  const auto localized_model_path = localized_model_dir->Path();
  const auto version_path =
      JoinPath({localized_model_path, std::to_string(version)});
  const std::string global_path =
      JoinPath({backend_dir, specialized_backend_name});
  const std::vector<std::string> search_paths = {
      version_path, localized_model_path, global_path};

  std::string backend_libdir;
  std::string backend_libpath;
  for (const auto& path : search_paths) {
    const auto full_path = JoinPath({path, backend_libname});
    bool exists = false;
    RETURN_IF_ERROR(FileExists(full_path, &exists));
    if (exists) {
      backend_libdir = path;
      backend_libpath = full_path;
      break;
    }
  }

  if (backend_libpath.empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "unable to find '" + backend_libname + "' for model '" +
            model_config.name() + "', searched: " + version_path + ", " +
            model_path + ", " + global_path);
  }

  // Resolve the global backend configuration with the specific backend
  // configuration
  triton::common::BackendCmdlineConfig config;
  RETURN_IF_ERROR(ResolveBackendConfigs(
      backend_cmdline_config_map, model_config.backend(), config));
  RETURN_IF_ERROR(SetBackendConfigDefaults(config));

  std::shared_ptr<TritonBackend> backend;
  RETURN_IF_ERROR(server->BackendManager()->CreateBackend(
      model_config.backend(), backend_libdir, backend_libpath, config,
      &backend));

  // Normalize backend-dependent config
  {
    const auto& attributes = backend->BackendAttributes();
    // [WIP] formalize config normalization / validation
    RETURN_IF_ERROR(NormalizeInstanceGroup(
        min_compute_capability, attributes.preferred_groups_, &model_config));
    RETURN_IF_ERROR(
        ValidateInstanceGroup(model_config, min_compute_capability));
  }

  // Create and initialize the model.
  std::unique_ptr<TritonModel> local_model(new TritonModel(
      server, localized_model_dir, backend, min_compute_capability, version,
      model_config, auto_complete_config));

  TritonModel* raw_local_model = local_model.get();

  // Model initialization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object. We must set the shared library
  // path to point to the backend directory in case the backend
  // library attempts to load additional shared libraries.
  if (backend->ModelInitFn() != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(backend->Directory()));

    TRITONSERVER_Error* err = backend->ModelInitFn()(
        reinterpret_cast<TRITONBACKEND_Model*>(raw_local_model));

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  // Initialize the model for Triton core usage
  RETURN_IF_ERROR(local_model->Init(is_config_provided));

  bool device_blocking = false;
  if (local_model->backend_->ExecutionPolicy() ==
      TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
    if (model_config.has_sequence_batching()) {
      LOG_INFO << "Overriding execution policy to "
                  "\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
               << model_config.name() << "\"";
    } else {
      device_blocking = true;
    }
  }

  // Create and initialize the model instances for this model.
  RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
      raw_local_model, backend_cmdline_config_map, host_policy_map,
      model_config, device_blocking));

  RETURN_IF_ERROR(local_model->SetConfiguredScheduler());

  *model = std::move(local_model);
  return Status::Success;
}
Status
TritonModel::ResolveBackendConfigs(
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const std::string& backend_name,
    triton::common::BackendCmdlineConfig& config)
{
  const auto& global_itr = backend_cmdline_config_map.find(std::string());
  const auto& specific_itr = backend_cmdline_config_map.find(backend_name);
  if (specific_itr == backend_cmdline_config_map.end() &&
      global_itr != backend_cmdline_config_map.end()) {
    for (auto setting : global_itr->second) {
      config.push_back(setting);
    }
  } else if (
      specific_itr != backend_cmdline_config_map.end() &&
      global_itr == backend_cmdline_config_map.end()) {
    for (auto setting : specific_itr->second) {
      config.push_back(setting);
    }
  } else if (
      specific_itr != backend_cmdline_config_map.end() &&
      global_itr != backend_cmdline_config_map.end()) {
    triton::common::BackendCmdlineConfig global_backend_config =
        global_itr->second;
    triton::common::BackendCmdlineConfig specific_backend_config =
        specific_itr->second;

    std::sort(global_backend_config.begin(), global_backend_config.end());
    std::sort(specific_backend_config.begin(), specific_backend_config.end());

    size_t global_index = 0;
    size_t specific_index = 0;
    while (global_index < global_backend_config.size() &&
           specific_index < specific_backend_config.size()) {
      auto& current_global_setting = global_backend_config.at(global_index);
      auto& current_specific_setting =
          specific_backend_config.at(specific_index);
      if (current_specific_setting.first.compare(
              current_global_setting.first) == 0) {
        // specific setting overrides global setting
        config.push_back(current_specific_setting);
        ++global_index;
        ++specific_index;
      } else if (
          current_specific_setting.first.compare(
              current_global_setting.first) < 0) {
        config.push_back(current_specific_setting);
        ++specific_index;
      } else {
        config.push_back(current_global_setting);
        ++global_index;
      }
    }

    // add the rest of the global configs
    if (global_index < global_backend_config.size()) {
      auto& current_global_setting = global_backend_config.at(global_index);
      config.push_back(current_global_setting);
    }

    // add the rest of the specific settings
    if (specific_index < specific_backend_config.size()) {
      auto& current_specific_setting =
          specific_backend_config.at(specific_index);
      config.push_back(current_specific_setting);
    }
  }  // else empty config

  return Status::Success;
}
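To make the merge rule above concrete, here is a small worked example; the backend name and setting values are hypothetical and only illustrate the key-wise merge in which the backend-specific value wins.

// Worked example (values hypothetical):
//   backend_cmdline_config_map[""]            = {{"default-max-batch-size","4"},
//                                                {"verbose","0"}}
//   backend_cmdline_config_map["onnxruntime"] = {{"verbose","1"}}
// ResolveBackendConfigs(map, "onnxruntime", config) yields
//   config = {{"default-max-batch-size","4"}, {"verbose","1"}}
// i.e. both maps are merged by sorted key and, on a key collision,
// the backend-specific setting overrides the global one.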
const std::unordered_map<std::string, std::string> backend_config_defaults(
    {{"default-max-batch-size", "4"}});

Status
TritonModel::SetBackendConfigDefaults(
    triton::common::BackendCmdlineConfig& config)
{
  auto backend_config_defaults_copy = backend_config_defaults;

  for (auto& setting : config) {
    if (setting.first.compare("default-max-batch-size") == 0) {
      LOG_VERBOSE(1) << "Found overwritten default setting: " << setting.first
                     << "," << setting.second;
      backend_config_defaults_copy.erase(setting.first);
    }

    if (backend_config_defaults_copy.empty()) {
      break;
    }
  }

  // Anything left should be added to the config
  for (const auto& default_setting : backend_config_defaults_copy) {
    LOG_VERBOSE(1) << "Adding default backend config setting: "
                   << default_setting.first << "," << default_setting.second;
    config.push_back(
        std::make_pair(default_setting.first, default_setting.second));
  }

  return Status::Success;
}
Status
TritonModel::AddInstance(
    std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
{
  if (passive) {
    passive_instances_.emplace_back(std::move(instance));
  } else {
    instances_.emplace_back(std::move(instance));
  }
  return Status::Success;
}
Status
TritonModel::UpdateModelConfig(
    const uint32_t config_version,
    TRITONSERVER_Message* updated_config_message)
{
  const char* buffer;
  size_t byte_size;
  RETURN_IF_TRITONSERVER_ERROR(TRITONSERVER_MessageSerializeToJson(
      updated_config_message, &buffer, &byte_size));
  inference::ModelConfig updated_config;
  RETURN_IF_ERROR(
      JsonToModelConfig({buffer, byte_size}, config_version, &updated_config));
  auto config = Config();
  config.set_max_batch_size(updated_config.max_batch_size());

  auto inputs_config = config.mutable_input();
  *inputs_config = updated_config.input();
  auto outputs_config = config.mutable_output();
  *outputs_config = updated_config.output();

  if (!config.scheduling_choice_case()) {
    if (updated_config.has_dynamic_batching()) {
      auto dynamic_batching_config = config.mutable_dynamic_batching();
      *dynamic_batching_config = updated_config.dynamic_batching();
    } else if (updated_config.has_sequence_batching()) {
      auto sequence_batching_config = config.mutable_sequence_batching();
      *sequence_batching_config = updated_config.sequence_batching();
    } else if (updated_config.has_ensemble_scheduling()) {
      auto ensemble_scheduling_config = config.mutable_ensemble_scheduling();
      *ensemble_scheduling_config = updated_config.ensemble_scheduling();
    }  // else do nothing
  } else if (
      config.scheduling_choice_case() !=
      updated_config.scheduling_choice_case()) {
    return Status(
        triton::common::Error::Code::INTERNAL,
        (std::string("Cannot update scheduling choice from ") +
         std::to_string(config.scheduling_choice_case()) +
         std::string(" to ") +
         std::to_string(updated_config.scheduling_choice_case()) +
         std::string(" when auto-completing."))
            .c_str());
  }  // else do nothing

  // Need to normalize the model configuration for
  // populating missing fields.
  RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability_, &config));

  RETURN_IF_ERROR(SetModelConfig(config));

  return Status::Success;
}
Status
TritonModel::SetConfiguredScheduler()
{
  std::unique_ptr<Scheduler> scheduler;

  // Need to enforce equal shape batches (i.e. non-ragged batches) if
  // the model 1) allows one or more variable-size input tensors that
  // are not marked as 'allow_ragged_batch' or 2) has one or more
  // shape-tensor inputs. This is not needed if all input shapes are
  // non-variable and if there are no shape tensors... so we don't
  // enable it in that case for efficiency reasons.
  std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
  for (const auto input : config_.input()) {
    if (input.is_shape_tensor()) {
      enforce_equal_shape_tensors.insert({input.name(), true});
    } else if (
        !input.allow_ragged_batch() &&
        (triton::common::GetElementCount(input) == -1)) {
      enforce_equal_shape_tensors.insert({input.name(), false});
    }
  }

  // If 'sequence_batching' is configured, then use the SequenceBatchScheduler,
  // otherwise use the default DynamicBatchScheduler.
  if (config_.has_sequence_batching()) {
    // Sequence batcher
    RETURN_IF_ERROR(SequenceBatchScheduler::Create(
        this, enforce_equal_shape_tensors, &scheduler));
  } else if (config_.has_dynamic_batching()) {
    // Dynamic batcher
    RETURN_IF_ERROR(DynamicBatchScheduler::Create(
        this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
        config_.max_batch_size(), enforce_equal_shape_tensors,
        config_.dynamic_batching(),
        config_.response_cache().enable() /* response_cache_enable */,
        &scheduler));
  } else {
    // Default scheduler. Use dynamic batch scheduler (with batching
    // disabled) as the default scheduler.
    RETURN_IF_ERROR(DynamicBatchScheduler::Create(
        this, nullptr, 0 /*nice*/, false /* dynamic_batching_enabled */,
        1 /* max_batch_size */,
        std::unordered_map<
            std::string, bool>() /* enforce_equal_shape_tensors */,
        false /* preserve_ordering */,
        config_.response_cache().enable() /* response_cache_enable */,
        std::set<int32_t>() /* preferred_batch_sizes */,
        0 /* max_queue_delay_microseconds */, &scheduler));
  }

  return SetScheduler(std::move(scheduler));
}
Status
TritonModel::Initialize()
{
  for (const auto& instance : instances_) {
    RETURN_IF_ERROR(instance->Initialize());
  }

  return Status::Success;
}

Status
TritonModel::WarmUp()
{
  for (const auto& instance : instances_) {
    RETURN_IF_ERROR(instance->WarmUp());
  }

  return Status::Success;
}
TritonModel::TritonModel(
    InferenceServer* server,
    const std::shared_ptr<LocalizedPath>& localized_model_dir,
    const std::shared_ptr<TritonBackend>& backend,
    const double min_compute_capability, const int64_t version,
    const inference::ModelConfig& config, const bool auto_complete_config)
    : Model(
          min_compute_capability, localized_model_dir->Path(), version,
          config),
      server_(server), min_compute_capability_(min_compute_capability),
      auto_complete_config_(auto_complete_config),
      localized_model_dir_(localized_model_dir), backend_(backend),
      state_(nullptr)
{
}

TritonModel::~TritonModel()
{
  // Explicitly delete/finalize all model instances before finalizing
  // the model itself.
  instances_.clear();
  passive_instances_.clear();

  // Unregister itself from the rate limiter. Note this should happen
  // after all instances are destructed. Destructing instances ensures
  // there are no instance threads waiting on rate limiter for
  // receiving their payloads.
  server_->GetRateLimiter()->UnregisterModel(this);

  // Model finalization is optional... The TRITONBACKEND_Model
  // object is this TritonModel object.
  if (backend_->ModelFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        backend_->ModelFiniFn()(reinterpret_cast<TRITONBACKEND_Model*>(this)),
        "failed finalizing model");
  }
}
extern "C" {

//
// TRITONBACKEND_Model
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelName(TRITONBACKEND_Model* model, const char** name)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *name = tm->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelVersion(TRITONBACKEND_Model* model, uint64_t* version)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *version = tm->Version();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelRepository(
    TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
    const char** location)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
  *location = tm->LocalizedModelPath().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message** model_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);

  std::string model_config_json;
  Status status =
      ModelConfigToJson(tm->Config(), config_version, &model_config_json);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *model_config = reinterpret_cast<TRITONSERVER_Message*>(
      new TritonServerMessage(std::move(model_config_json)));

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
    TRITONBACKEND_Model* model, bool* auto_complete_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *auto_complete_config = tm->AutoCompleteConfig();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetConfig(
    TRITONBACKEND_Model* model, const uint32_t config_version,
    TRITONSERVER_Message* model_config)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  Status status = tm->UpdateModelConfig(config_version, model_config);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelServer(
    TRITONBACKEND_Model* model, TRITONSERVER_Server** server)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *server = reinterpret_cast<TRITONSERVER_Server*>(tm->Server());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelBackend(
    TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *backend = reinterpret_cast<TRITONBACKEND_Backend*>(tm->Backend().get());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelState(TRITONBACKEND_Model* model, void** state)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  *state = tm->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetState(TRITONBACKEND_Model* model, void* state)
{
  TritonModel* tm = reinterpret_cast<TritonModel*>(model);
  tm->SetState(state);
  return nullptr;  // success
}
///
/// TRITONBACKEND_Request
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestId(TRITONBACKEND_Request* request, const char** id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *id = tr->Id().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationId(
    TRITONBACKEND_Request* request, uint64_t* id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
  if (correlation_id.Type() !=
      InferenceRequest::SequenceId::DataType::UINT64) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "correlation ID in request is not an unsigned int")
            .c_str());
  }
  *id = correlation_id.UnsignedIntValue();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestFlags(TRITONBACKEND_Request* request, uint32_t* flags)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *flags = tr->Flags();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
    TRITONBACKEND_Request* request, const char** id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
  if (correlation_id.Type() !=
      InferenceRequest::SequenceId::DataType::STRING) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "correlation ID in request is not a string")
            .c_str());
  }
  *id = correlation_id.StringValue().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputCount(
    TRITONBACKEND_Request* request, uint32_t* count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *count = tr->ImmutableInputs().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** input_name)
{
  *input_name = nullptr;

  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  if (index >= inputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(inputs.size()) + " inputs")
            .c_str());
  }

  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. This linear search is the best we can do given the need for
  // the inputs to be in a map and given the typical small number of
  // inputs is better than having every request maintain the inputs as
  // both map and vector.
  uint32_t cnt = 0;
  for (const auto& pr : inputs) {
    if (cnt++ == index) {
      InferenceRequest::Input* in = pr.second;
      *input_name = in->Name().c_str();
      break;
    }
  }

  return nullptr;  // success
}
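A minimal backend-side sketch of how the request/input functions above are commonly combined to walk all inputs of a request; it only uses APIs defined in this tree, and the abbreviated error handling is an assumption made to keep the sketch short.

// Illustrative sketch only; not part of the committed sources.
#include "triton/core/tritonbackend.h"

void
ListRequestInputs(TRITONBACKEND_Request* request)
{
  uint32_t count = 0;
  TRITONBACKEND_RequestInputCount(request, &count);
  for (uint32_t i = 0; i < count; ++i) {
    const char* name = nullptr;
    if (TRITONBACKEND_RequestInputName(request, i, &name) == nullptr) {
      TRITONBACKEND_Input* input = nullptr;
      TRITONBACKEND_RequestInput(request, name, &input);
      // 'input' can now be queried with TRITONBACKEND_InputProperties()
      // and its buffers read with TRITONBACKEND_InputBuffer().
    }
  }
}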
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInput(
    TRITONBACKEND_Request* request, const char* name,
    TRITONBACKEND_Input** input)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  const auto& itr = inputs.find(name);
  if (itr == inputs.end()) {
    *input = nullptr;
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "unknown request input name " + name).c_str());
  }

  InferenceRequest::Input* in = itr->second;
  *input = reinterpret_cast<TRITONBACKEND_Input*>(in);

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputByIndex(
    TRITONBACKEND_Request* request, const uint32_t index,
    TRITONBACKEND_Input** input)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& inputs = tr->ImmutableInputs();
  if (index >= inputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(inputs.size()) + " inputs")
            .c_str());
  }

  // The request inputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // map. This linear search is the best we can do given the need for
  // the inputs to be in a map and given the typical small number of
  // inputs is better than having every request maintain the inputs as
  // both map and vector.
  uint32_t cnt = 0;
  for (const auto& pr : inputs) {
    if (cnt++ == index) {
      InferenceRequest::Input* in = pr.second;
      *input = reinterpret_cast<TRITONBACKEND_Input*>(in);
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputCount(
    TRITONBACKEND_Request* request, uint32_t* count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  *count = tr->ImmutableRequestedOutputs().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputName(
    TRITONBACKEND_Request* request, const uint32_t index,
    const char** output_name)
{
  *output_name = nullptr;

  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  const auto& routputs = tr->ImmutableRequestedOutputs();
  if (index >= routputs.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (tr->LogRequest() + "out of bounds index " + std::to_string(index) +
         ": request has " + std::to_string(routputs.size()) +
         " requested outputs")
            .c_str());
  }

  // The requested outputs are not allowed to change once the request
  // makes it to the backend, so it is ok to just iterate through the
  // set. This linear search is the best we can do given the requested
  // outputs being in a set and given the typical small number of
  // requested outputs it should not be a performance issue.
  uint32_t cnt = 0;
  for (const auto& rout : routputs) {
    if (cnt++ == index) {
      *output_name = rout.c_str();
      break;
    }
  }

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
    TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  auto status =
      tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestRelease(
    TRITONBACKEND_Request* request, uint32_t release_flags)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  std::unique_ptr<InferenceRequest> ur(tr);
  InferenceRequest::Release(std::move(ur), release_flags);
  return nullptr;  // success
}
///
/// TRITONBACKEND_State
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateUpdate(TRITONBACKEND_State* state)
{
  SequenceState* ts = reinterpret_cast<SequenceState*>(state);
  auto status = ts->Update();
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateNew(
    TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  SequenceState* lstate;
  std::vector<int64_t> lshape(shape, shape + dims_count);
  auto& sequence_state = tr->GetSequenceStates();

  if (sequence_state == nullptr) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("unable to add state '") + name +
         "'. State configuration is missing for model '" + tr->ModelName() +
         "'.")
            .c_str());
  }

  Status status = sequence_state->OutputState(
      name, TritonToDataType(datatype), lshape, &lstate);
  if (!status.IsOk()) {
    *state = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  *state = reinterpret_cast<TRITONBACKEND_State*>(lstate);

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBuffer(
    TRITONBACKEND_State* state, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  SequenceState* to = reinterpret_cast<SequenceState*>(state);
  Status status = Status::Success;

  // If the buffer size exactly matches the buffer available, reuse the
  // currently allocated buffer.
  if (to->Data()->TotalByteSize() == buffer_byte_size) {
    const std::shared_ptr<AllocatedMemory>& memory =
        reinterpret_cast<const std::shared_ptr<AllocatedMemory>&>(to->Data());
    TRITONSERVER_MemoryType current_memory_type;
    int64_t current_memory_type_id;
    void* lbuffer =
        memory->MutableBuffer(&current_memory_type, &current_memory_type_id);

    // If the requested memory type doesn't match the current buffer,
    // allocate a new buffer with the requested memory type and memory
    // type id.
    if (current_memory_type == *memory_type &&
        current_memory_type_id == *memory_type_id) {
      *buffer = lbuffer;
    } else {
      std::shared_ptr<AllocatedMemory> memory =
          std::make_shared<AllocatedMemory>(
              buffer_byte_size, *memory_type, *memory_type_id);
      *buffer = memory->MutableBuffer(memory_type, memory_type_id);
      to->RemoveAllData();
      status = to->SetData(memory);
    }
  } else {
    std::shared_ptr<AllocatedMemory> memory =
        std::make_shared<AllocatedMemory>(
            buffer_byte_size, *memory_type, *memory_type_id);
    *buffer = memory->MutableBuffer(memory_type, memory_type_id);
    to->RemoveAllData();
    status = to->SetData(memory);
  }

  if (!status.IsOk()) {
    *buffer = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  return nullptr;  // success
}
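A hedged sketch of the implicit-state flow these three functions support, from a backend's point of view: create the named state for the request, obtain a buffer, fill it, then commit the value for the next request in the sequence. The state name "beam_state" is a hypothetical example and error handling is abbreviated.

// Illustrative sketch only; not part of the committed sources.
#include <cstring>
#include "triton/core/tritonbackend.h"

TRITONSERVER_Error*
WriteState(TRITONBACKEND_Request* request, const float* data, size_t count)
{
  TRITONBACKEND_State* state = nullptr;
  int64_t shape[1] = {static_cast<int64_t>(count)};
  TRITONSERVER_Error* err = TRITONBACKEND_StateNew(
      &state, request, "beam_state", TRITONSERVER_TYPE_FP32, shape, 1);
  if (err != nullptr) {
    return err;
  }

  void* buffer = nullptr;
  TRITONSERVER_MemoryType mtype = TRITONSERVER_MEMORY_CPU;
  int64_t mtype_id = 0;
  err = TRITONBACKEND_StateBuffer(
      state, &buffer, count * sizeof(float), &mtype, &mtype_id);
  if (err != nullptr) {
    return err;
  }
  if (mtype == TRITONSERVER_MEMORY_CPU) {
    std::memcpy(buffer, data, count * sizeof(float));
  }

  // Make the new value visible to the next request in the sequence.
  return TRITONBACKEND_StateUpdate(state);
}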
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBufferAttributes(
    TRITONBACKEND_State* state,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  SequenceState* to = reinterpret_cast<SequenceState*>(state);
  to->Data()->BufferAt(
      0, reinterpret_cast<BufferAttributes**>(buffer_attributes));
  return nullptr;  // success
}

//
// TRITONBACKEND_ResponseFactory
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryNew(
    TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      new std::shared_ptr<InferenceResponseFactory>(tr->ResponseFactory());
  *factory =
      reinterpret_cast<TRITONBACKEND_ResponseFactory*>(response_factory);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryDelete(TRITONBACKEND_ResponseFactory* factory)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
  delete response_factory;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
    TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
  Status status = (*response_factory)->SendFlags(send_flags);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
///
/// TRITONBACKEND_Response
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNew(
    TRITONBACKEND_Response** response, TRITONBACKEND_Request* request)
{
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);

  std::unique_ptr<InferenceResponse> tresp;
  Status status = tr->ResponseFactory()->CreateResponse(&tresp);
  if (!status.IsOk()) {
    *response = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *response = reinterpret_cast<TRITONBACKEND_Response*>(tresp.release());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
    TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory)
{
  std::shared_ptr<InferenceResponseFactory>* response_factory =
      reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);

  std::unique_ptr<InferenceResponse> tr;
  Status status = (*response_factory)->CreateResponse(&tr);
  if (!status.IsOk()) {
    *response = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *response = reinterpret_cast<TRITONBACKEND_Response*>(tr.release());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseDelete(TRITONBACKEND_Response* response)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  delete tr;
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
    TRITONBACKEND_Response* response, const char* name, const char* value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
    TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
    TRITONBACKEND_Response* response, const char* name, const bool value)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  Status status = tr->AddParameter(name, value);
  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseOutput(
    TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
    const char* name, const TRITONSERVER_DataType datatype,
    const int64_t* shape, const uint32_t dims_count)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
  std::vector<int64_t> lshape(shape, shape + dims_count);
  InferenceResponse::Output* loutput;
  Status status = tr->AddOutput(
      name, TritonToDataType(datatype), std::move(lshape), &loutput);
  if (!status.IsOk()) {
    *output = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  *output = reinterpret_cast<TRITONBACKEND_Output*>(loutput);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSend(
    TRITONBACKEND_Response* response, const uint32_t send_flags,
    TRITONSERVER_Error* error)
{
  InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);

  Status status;

  std::unique_ptr<InferenceResponse> utr(tr);
  if (error == nullptr) {
    status = InferenceResponse::Send(std::move(utr), send_flags);
  } else {
    status = InferenceResponse::SendWithStatus(
        std::move(utr), send_flags,
        Status(
            TritonCodeToStatusCode(TRITONSERVER_ErrorCode(error)),
            TRITONSERVER_ErrorMessage(error)));
  }

  if (!status.IsOk()) {
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }

  return nullptr;  // success
}
///
/// TRITONBACKEND_Input
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputProperties(
    TRITONBACKEND_Input* input, const char** name,
    TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  if (name != nullptr) {
    *name = ti->Name().c_str();
  }
  if (datatype != nullptr) {
    *datatype = DataTypeToTriton(ti->DType());
  }
  if (shape != nullptr) {
    *shape = ti->ShapeWithBatchDim().data();
  }
  if (dims_count != nullptr) {
    *dims_count = ti->ShapeWithBatchDim().size();
  }
  if (byte_size != nullptr) {
    *byte_size = ti->Data()->TotalByteSize();
  }
  if (buffer_count != nullptr) {
    *buffer_count = ti->DataBufferCount();
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
    uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  if (name != nullptr) {
    *name = ti->Name().c_str();
  }
  if (datatype != nullptr) {
    *datatype = DataTypeToTriton(ti->DType());
  }
  if (shape != nullptr) {
    *shape = ti->ShapeWithBatchDim().data();
  }
  if (dims_count != nullptr) {
    *dims_count = ti->ShapeWithBatchDim().size();
  }
  if (host_policy_name != nullptr) {
    if (byte_size != nullptr) {
      *byte_size = ti->Data(host_policy_name)->TotalByteSize();
    }
    if (buffer_count != nullptr) {
      *buffer_count = ti->DataBufferCountForHostPolicy(host_policy_name);
    }
  } else {
    if (byte_size != nullptr) {
      *byte_size = ti->Data()->TotalByteSize();
    }
    if (buffer_count != nullptr) {
      *buffer_count = ti->DataBufferCount();
    }
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBuffer(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  Status status = ti->DataBuffer(
      index, buffer, buffer_byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_byte_size = 0;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
    TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);
  Status status = ti->DataBufferAttributes(
      index, buffer, reinterpret_cast<BufferAttributes**>(buffer_attributes));
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_attributes = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
    TRITONBACKEND_Input* input, const char* host_policy_name,
    const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  InferenceRequest::Input* ti =
      reinterpret_cast<InferenceRequest::Input*>(input);

  Status status =
      (host_policy_name == nullptr)
          ? ti->DataBuffer(
                index, buffer, buffer_byte_size, memory_type, memory_type_id)
          : ti->DataBufferForHostPolicy(
                index, buffer, buffer_byte_size, memory_type, memory_type_id,
                host_policy_name);
  if (!status.IsOk()) {
    *buffer = nullptr;
    *buffer_byte_size = 0;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}
///
/// TRITONBACKEND_Output
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBuffer(
    TRITONBACKEND_Output* output, void** buffer,
    const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
    int64_t* memory_type_id)
{
  InferenceResponse::Output* to =
      reinterpret_cast<InferenceResponse::Output*>(output);
  Status status = to->AllocateDataBuffer(
      buffer, buffer_byte_size, memory_type, memory_type_id);
  if (!status.IsOk()) {
    *buffer = nullptr;
    return TRITONSERVER_ErrorNew(
        StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
  }
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
    TRITONBACKEND_Output* output,
    TRITONSERVER_BufferAttributes** buffer_attributes)
{
  InferenceResponse::Output* to =
      reinterpret_cast<InferenceResponse::Output*>(output);
  *buffer_attributes = reinterpret_cast<TRITONSERVER_BufferAttributes*>(
      to->GetBufferAttributes());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
    TRITONBACKEND_BackendAttribute* backend_attributes,
    const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
    const uint64_t* device_ids, const uint64_t id_count)
{
  auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
  ba->preferred_groups_.emplace_back();
  auto& pg = ba->preferred_groups_.back();
  switch (kind) {
    case TRITONSERVER_INSTANCEGROUPKIND_AUTO:
      pg.set_kind(inference::ModelInstanceGroup::KIND_AUTO);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_CPU:
      pg.set_kind(inference::ModelInstanceGroup::KIND_CPU);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_GPU:
      pg.set_kind(inference::ModelInstanceGroup::KIND_GPU);
      break;
    case TRITONSERVER_INSTANCEGROUPKIND_MODEL:
      pg.set_kind(inference::ModelInstanceGroup::KIND_MODEL);
      break;
  }
  pg.set_count(count);
  if (device_ids != nullptr) {
    for (size_t i = 0; i < id_count; ++i) {
      pg.add_gpus(device_ids[i]);
    }
  }
  return nullptr;
}

}  // extern C

}}  // namespace triton::core
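A hedged sketch of the backend side of the attribute API above: a backend that exports TRITONBACKEND_GetBackendAttribute can register a preferred instance group that Triton consults when the model config does not pin one down. The single KIND_GPU group shown is an arbitrary illustration, not a value taken from this commit.

// Illustrative sketch only; not part of the committed sources.
#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_GetBackendAttribute(
    TRITONBACKEND_Backend* backend,
    TRITONBACKEND_BackendAttribute* backend_attributes)
{
  // Prefer one KIND_GPU instance per model unless the config says otherwise.
  return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
      backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU, 1 /* count */,
      nullptr /* device_ids */, 0 /* id_count */);
}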
3rdparty/core-r22.12/src/backend_model.h
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <string>
#include "backend_manager.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model.h"
#include "model_config.pb.h"
#include "status.h"
namespace triton { namespace core {

class InferenceServer;
class TritonModelInstance;

//
// Represents a model.
//
// Inheriting from Model to implement backend APIs
//
class TritonModel : public Model {
 public:
  static Status Create(
      InferenceServer* server, const std::string& model_path,
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
      const std::string& model_name, const int64_t version,
      inference::ModelConfig model_config, const bool is_config_provided,
      std::unique_ptr<TritonModel>* model);
  ~TritonModel();

  const std::string& LocalizedModelPath() const
  {
    return localized_model_dir_->Path();
  }

  InferenceServer* Server() { return server_; }
  bool AutoCompleteConfig() const { return auto_complete_config_; }

  Status UpdateModelConfig(
      const uint32_t config_version,
      TRITONSERVER_Message* updated_config_message);

  const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }

  const std::vector<std::unique_ptr<TritonModelInstance>>& Instances() const
  {
    return instances_;
  }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  Status AddInstance(
      std::unique_ptr<TritonModelInstance>&& instance, const bool passive);

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonModel);

  TritonModel(
      InferenceServer* server,
      const std::shared_ptr<LocalizedPath>& localized_model_dir,
      const std::shared_ptr<TritonBackend>& backend,
      const double min_compute_capability, const int64_t version,
      const inference::ModelConfig& config, const bool auto_complete_config);

  // Set the scheduler based on the model configuration. The scheduler
  // can only be set once for a backend.
  Status SetConfiguredScheduler();

  // Merges the global backend configs with the specific
  // backend configs.
  static Status ResolveBackendConfigs(
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const std::string& backend_name,
      triton::common::BackendCmdlineConfig& config);

  // Sets defaults for some backend configurations when none are specified on
  // the command line.
  static Status SetBackendConfigDefaults(
      triton::common::BackendCmdlineConfig& config);

  Status Initialize();
  Status WarmUp();

  // The server object that owns this model. The model holds this as a
  // raw pointer because the lifetime of the server is guaranteed to
  // be longer than the lifetime of a model owned by the server.
  InferenceServer* server_;

  // The minimum supported compute capability on device.
  const double min_compute_capability_;

  // Whether the backend should attempt to auto-complete the model config.
  const bool auto_complete_config_;

  // The localized repo directory holding the model. If localization
  // required creation of a temporary local copy then that copy will
  // persist as long as this object is retained by this model.
  std::shared_ptr<LocalizedPath> localized_model_dir_;

  // Backend used by this model.
  std::shared_ptr<TritonBackend> backend_;

  // The model instances for this model.
  std::vector<std::unique_ptr<TritonModelInstance>> instances_;
  std::vector<std::unique_ptr<TritonModelInstance>> passive_instances_;

  // Opaque state associated with this model.
  void* state_;
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/backend_model_instance.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model_instance.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
// For unknown reason, windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {

namespace {

// Utilities for warmup feature
TRITONSERVER_Error*
WarmupResponseAlloc(
    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
    size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
    int64_t preferred_memory_type_id, void* userp, void** buffer,
    void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
    int64_t* actual_memory_type_id)
{
  *buffer = malloc(byte_size);
  if (*buffer != nullptr) {
    *actual_memory_type = TRITONSERVER_MEMORY_CPU;
    *actual_memory_type_id = 0;
    return nullptr;
  }

  return TRITONSERVER_ErrorNew(
      TRITONSERVER_ERROR_INTERNAL,
      "failed to allocate output buffer for warmup.");
}

TRITONSERVER_Error*
WarmupResponseRelease(
    TRITONSERVER_ResponseAllocator* allocator, void* buffer,
    void* buffer_userp, size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id)
{
  free(buffer);
  return nullptr;
}

ResponseAllocator warmup_allocator = ResponseAllocator(
    WarmupResponseAlloc, WarmupResponseRelease, nullptr /* start_fn */);

void
WarmupResponseComplete(
    TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags,
    void* userp)
{
  auto res_pair = reinterpret_cast<
      std::pair<std::promise<void>, std::vector<std::string>*>*>(userp);
  if (iresponse != nullptr) {
    auto err = TRITONSERVER_InferenceResponseError(iresponse);
    if (err != nullptr) {
      // The error vector is shared by all requests in the batch for now
      static std::mutex res_mtx;
      {
        std::lock_guard<std::mutex> lk(res_mtx);
        res_pair->second->emplace_back(TRITONSERVER_ErrorMessage(err));
      }
      TRITONSERVER_ErrorDelete(err);
    }
    // Just delete the response, warmup doesn't check for correctness
    LOG_TRITONSERVER_ERROR(
        TRITONSERVER_InferenceResponseDelete(iresponse),
        "deleting warmup response");
  }
  // Last response
  if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
    res_pair->first.set_value();
  }
}

void
WarmupRequestComplete(
    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
  if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
    // Don't need to release the request here, it is managed in WarmupData
    if (userp != nullptr) {
      auto warmup_promise = reinterpret_cast<std::promise<void>*>(userp);
      warmup_promise->set_value();
    }
  }
}

}  // namespace

TritonModelInstance::TritonModelInstance(
    TritonModel* model, const std::string& name, const size_t index,
    const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
    const std::vector<std::string>& profile_names, const bool passive,
    const triton::common::HostPolicyCmdlineConfig& host_policy,
    const TritonServerMessage& host_policy_message,
    const std::vector<SecondaryDevice>& secondary_devices)
    : model_(model), name_(name), index_(index), kind_(kind),
      device_id_(device_id), host_policy_(host_policy),
      host_policy_message_(host_policy_message), profile_names_(profile_names),
      passive_(passive), secondary_devices_(secondary_devices),
      state_(nullptr)
{
#ifdef TRITON_ENABLE_METRICS
  if (Metrics::Enabled()) {
    // Use an ID in the metric only for GPU instances. Otherwise use
    // METRIC_REPORTER_ID_CPU to indicate no device should be reported in the
    // metric.
    const int id = (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU)
                       ? device_id_
                       : METRIC_REPORTER_ID_CPU;
    MetricModelReporter::Create(
        model_->Name(), model_->Version(), id, model_->Config().metric_tags(),
        &reporter_);
  }
#endif  // TRITON_ENABLE_METRICS
}

TritonModelInstance::~TritonModelInstance()
{
  if (triton_backend_thread_.get() != nullptr) {
    triton_backend_thread_->StopBackendThread();
  }

  // Model finalization is optional...
  if (model_->Backend()->ModelInstanceFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        model_->Backend()->ModelInstanceFiniFn()(
            reinterpret_cast<TRITONBACKEND_ModelInstance*>(this)),
        "failed finalizing model instance");
  }
}

Status
TritonModelInstance::CreateInstances(
    TritonModel* model,
    const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
    const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
    const inference::ModelConfig& model_config, const bool device_blocking)
{
  static triton::common::HostPolicyCmdlineConfig empty_host_policy;

  // This structure is used to allocate TritonBackendThread to instances on the
  // same device for the device blocking execution policy.
  std::map<uint32_t, std::shared_ptr<TritonBackendThread>>
      device_to_thread_map;

  for (const auto& group : model_config.instance_group()) {
    std::vector<std::string> profile_names;
    for (const auto& profile_name : group.profile()) {
      profile_names.push_back(profile_name);
    }
    std::vector<SecondaryDevice> secondary_devices;
    for (const auto& secondary_device : group.secondary_devices()) {
      secondary_devices.emplace_back(
          inference::
              ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name(
                  secondary_device.kind()),
          secondary_device.device_id());
    }
    for (int32_t c = 0; c < group.count(); ++c) {
      std::string instance_name{
          group.count() > 1 ? group.name() + "_" + std::to_string(c)
                            : group.name()};
      const bool passive = group.passive();
      std::vector<std::tuple<
          std::string, TRITONSERVER_InstanceGroupKind, int32_t,
          const inference::ModelRateLimiter*>>
          instance_setting;
      if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
        instance_setting.emplace_back(
            group.host_policy().empty() ? "cpu" : group.host_policy(),
            TRITONSERVER_INSTANCEGROUPKIND_CPU, 0 /* device_id */,
            &group.rate_limiter());
      } else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
        for (const int32_t device_id : group.gpus()) {
          instance_setting.emplace_back(
              group.host_policy().empty()
                  ? ("gpu_" + std::to_string(device_id))
                  : group.host_policy(),
              TRITONSERVER_INSTANCEGROUPKIND_GPU, device_id,
              &group.rate_limiter());
        }
      } else if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
        instance_setting.emplace_back(
            group.host_policy().empty() ? "model" : group.host_policy(),
            TRITONSERVER_INSTANCEGROUPKIND_MODEL, 0 /* device_id */,
            &group.rate_limiter());
      } else {
        return Status(
            Status::Code::INVALID_ARG,
            std::string("instance_group kind ") +
                ModelInstanceGroup_Kind_Name(group.kind()) + " not supported");
      }
      for (const auto is : instance_setting) {
        const auto& kind = std::get<1>(is);
        const auto& id = std::get<2>(is);
        const std::string& policy_name = std::get<0>(is);
        const triton::common::HostPolicyCmdlineConfig* host_policy;
        const auto policy_it = host_policy_map.find(policy_name);
        if (policy_it != host_policy_map.end()) {
          host_policy = &policy_it->second;
        } else {
          host_policy = &empty_host_policy;
        }
        RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
        auto err = CreateInstance(
            model, instance_name, c, kind, id, profile_names, passive,
            policy_name, *host_policy, *(std::get<3>(is)), device_blocking,
            &device_to_thread_map, secondary_devices);
        RETURN_IF_ERROR(ResetNumaMemoryPolicy());
        RETURN_IF_ERROR(err);

        // When deploying on GPU, we want to make sure the GPU memory usage
        // is within the allowed range; otherwise, stop the creation to ensure
        // there is sufficient GPU memory for other use.
        // We check the usage after loading the instance to better enforce
        // the limit. If we checked before loading, we might create an instance
        // that occupies the rest of available memory, which defeats the
        // purpose.
        if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
          size_t free, total;
          double memory_limit;
          RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
          RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
              backend_cmdline_config_map, id, &memory_limit));
          const size_t allow = total * memory_limit;
          const size_t used = total - free;
          if (used > allow) {
            return Status(
                Status::Code::UNAVAILABLE,
                std::string("can not create model '") + instance_name +
                    "': memory limit set for " +
                    TRITONSERVER_InstanceGroupKindString(kind) + " " +
                    std::to_string(id) +
                    " has exceeded, model loading is rejected.");
          }
        }
      }
    }
  }

  return Status::Success;
}

Status
TritonModelInstance::CreateInstance(
    TritonModel* model, const std::string& name, const size_t index,
    const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
    const std::vector<std::string>& profile_names, const bool passive,
    const std::string& host_policy_name,
    const triton::common::HostPolicyCmdlineConfig& host_policy,
    const inference::ModelRateLimiter& rate_limiter_config,
    const bool device_blocking,
    std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
        device_to_thread_map,
    const std::vector<SecondaryDevice>& secondary_devices)
{
  // Create the JSON representation of the backend configuration.
  triton::common::TritonJson::Value host_policy_json(
      triton::common::TritonJson::ValueType::OBJECT);
  triton::common::TritonJson::Value policy_setting_json(
      host_policy_json, triton::common::TritonJson::ValueType::OBJECT);
  for (const auto& pr : host_policy) {
    RETURN_IF_ERROR(
        policy_setting_json.AddString(pr.first.c_str(), pr.second));
  }

  RETURN_IF_ERROR(host_policy_json.Add(
      host_policy_name.c_str(), std::move(policy_setting_json)));
  TritonServerMessage host_policy_message(host_policy_json);

  std::unique_ptr<TritonModelInstance> local_instance(new TritonModelInstance(
      model, name, index, kind, device_id, profile_names, passive, host_policy,
      host_policy_message, secondary_devices));

  TRITONBACKEND_ModelInstance* triton_instance =
      reinterpret_cast<TRITONBACKEND_ModelInstance*>(local_instance.get());

  // Instance initialization is optional... We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
  if (model->Backend()->ModelInstanceInitFn() != nullptr) {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->SetLibraryDirectory(model->Backend()->Directory()));

    TRITONSERVER_Error* err =
        model->Backend()->ModelInstanceInitFn()(triton_instance);

    RETURN_IF_ERROR(slib->ResetLibraryDirectory());
    RETURN_IF_TRITONSERVER_ERROR(err);
  }

  if (!passive) {
    RETURN_IF_ERROR(local_instance->GenerateWarmupData());
    RETURN_IF_ERROR(model->Server()->GetRateLimiter()->RegisterModelInstance(
        local_instance.get(), rate_limiter_config));
    RETURN_IF_ERROR(local_instance->SetBackendThread(
        kind, device_id, device_blocking, device_to_thread_map));
  }

  RETURN_IF_ERROR(model->AddInstance(std::move(local_instance), passive));

  return Status::Success;
}

Status
TritonModelInstance::SetBackendThread(
    const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
    const bool device_blocking,
    std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
        device_to_thread_map)
{
  if (device_blocking && (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU)) {
    auto thread_it = device_to_thread_map->find(device_id);
    if (thread_it != device_to_thread_map->end()) {
      LOG_VERBOSE(1) << "Using already started backend thread for " << Name()
                     << " on device " << device_id;
      triton_backend_thread_ = thread_it->second;
    }
  }
  if (triton_backend_thread_.get() == nullptr) {
    std::unique_ptr<TritonBackendThread> local_backend_thread;
    RETURN_IF_ERROR(TritonBackendThread::CreateBackendThread(
        Name(), this, 0 /* nice */, device_id, &local_backend_thread));
    triton_backend_thread_ = std::move(local_backend_thread);
    device_to_thread_map->insert({device_id, triton_backend_thread_});
  } else {
    triton_backend_thread_->AddModelInstance(this);
  }
  RETURN_IF_ERROR(triton_backend_thread_->InitAndWarmUpModelInstance(this));

  return Status::Success;
}

Status
TritonModelInstance::GenerateWarmupData()
{
  warmup_samples_.clear();
  for (const auto& warmup_setting : model_->Config().model_warmup()) {
    if (warmup_setting.batch_size() == 0) {
      LOG_VERBOSE(1) << "Skipping batch 0 warmup sample '"
                     << warmup_setting.name() << "'";
      continue;
    }
    LOG_VERBOSE(1) << "Generating warmup sample data for '"
                   << warmup_setting.name() << "'";

    // Two passes. First pass to get max byte size for synthetic
    // data. Second pass to add original inputs and override inputs
    // for control inputs.
    int64_t max_zero_byte_size = 0;
    int64_t max_random_byte_size = 0;
    for (const auto& input_meta : warmup_setting.inputs()) {
      auto element_count =
          triton::common::GetElementCount(input_meta.second.dims());
      if (element_count == -1) {
        return Status(
            Status::Code::INVALID_ARG,
            "warmup setting expects all variable-size dimensions are specified "
            "for input '" +
                input_meta.first + "'");
      }

      int64_t batch_byte_size =
          element_count *
          triton::common::GetDataTypeByteSize(input_meta.second.data_type());
      if (batch_byte_size == 0) {
        batch_byte_size = element_count * sizeof(int32_t);
      }

      switch (input_meta.second.input_data_type_case()) {
        case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
          max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
          break;
        case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
          // Because Triton expects STRING type to be in a special format
          // (prepend 4 bytes to specify string length), use zero data
          // for simplicity (4 bytes * element count of zeros).
          if (input_meta.second.data_type() ==
              inference::DataType::TYPE_STRING) {
            max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
          } else {
            max_random_byte_size =
                std::max(batch_byte_size, max_random_byte_size);
          }
          break;
        }
        default:
          break;
      }
    }

    warmup_samples_.emplace_back(
        warmup_setting.name(), warmup_setting.count());
    auto& warmup_data = warmup_samples_.back();
    // Create buffers for synthetic data
    TRITONSERVER_MemoryType type;
    int64_t type_id;
    warmup_data.zero_data_.reset(new AllocatedMemory(
        max_zero_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
        0 /* memory_type_id */));
    char* zero_buffer = warmup_data.zero_data_->MutableBuffer(&type, &type_id);
    memset(zero_buffer, 0, max_zero_byte_size);

    warmup_data.random_data_.reset(new AllocatedMemory(
        max_random_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
        0 /* memory_type_id */));
    char* random_buffer =
        warmup_data.random_data_->MutableBuffer(&type, &type_id);
    for (int64_t offset = 0; offset < max_random_byte_size; offset++) {
      random_buffer[offset] = rand();
    }

    // Prepare the inference request for the specified sample. We do not use
    // the in-process C API because the request doesn't go through the same
    // pipeline (i.e. no normalization / scheduler), so we need to prepare the
    // request to the state just before calling the instance execute function.
    for (size_t cnt = 0; cnt < warmup_setting.batch_size(); cnt++) {
      warmup_data.requests_.emplace_back(
          new InferenceRequest(model_, model_->Version()));
      auto& lrequest = warmup_data.requests_.back();

      // Second pass to prepare original inputs.
      std::vector<std::shared_ptr<InferenceRequest::Input>> input_sps;
      for (const auto& input_meta : warmup_setting.inputs()) {
        auto batch1_element_count =
            triton::common::GetElementCount(input_meta.second.dims());
        auto batch_byte_size =
            batch1_element_count *
            triton::common::GetDataTypeByteSize(input_meta.second.data_type());
        if (batch_byte_size == 0) {
          batch_byte_size = batch1_element_count * sizeof(int32_t);
        }

        const char* allocated_ptr;
        switch (input_meta.second.input_data_type_case()) {
          case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
            allocated_ptr = zero_buffer;
            break;
          case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
            if (input_meta.second.data_type() ==
                inference::DataType::TYPE_STRING) {
              allocated_ptr = zero_buffer;
            } else {
              allocated_ptr = random_buffer;
            }
            break;
          }
          case inference::ModelWarmup_Input::InputDataTypeCase::
              kInputDataFile: {
            // For data provided from file, we can set the buffer in the first
            // pass
            warmup_data.provided_data_.emplace_back(new std::string());
            auto input_data = warmup_data.provided_data_.back().get();
            RETURN_IF_ERROR(ReadTextFile(
                JoinPath({model_->LocalizedModelPath(), kWarmupDataFolder,
                          input_meta.second.input_data_file()}),
                input_data));
            if (input_meta.second.data_type() ==
                inference::DataType::TYPE_STRING) {
              batch_byte_size = input_data->size();
            } else if (((size_t)batch_byte_size) > input_data->size()) {
              return Status(
                  Status::Code::INVALID_ARG,
                  lrequest->LogRequest() + "warmup setting expects " +
                      std::to_string(batch_byte_size) +
                      " bytes, but the data "
                      "provided from " +
                      input_meta.second.input_data_file() + "only has " +
                      std::to_string(input_data->size()) + " bytes");
            }
            allocated_ptr = input_data->data();
            break;
          }
          default:
            return Status(
                Status::Code::INVALID_ARG,
                lrequest->LogRequest() + "warmup setting expects input '" +
                    input_meta.first + "' to have input_data_type set");
        }

        const inference::ModelInput* input_config;
        bool is_original_input =
            model_->GetInput(input_meta.first, &input_config).IsOk();
        InferenceRequest::Input* input = nullptr;
        std::vector<int64_t> input_meta_shape;
        // Append batch size only if the model supports batching
        // and this is not a control input.
        if ((model_->Config().max_batch_size() != 0) && is_original_input) {
          input_meta_shape.push_back(1);
        }
        for (auto d : input_meta.second.dims()) {
          input_meta_shape.push_back(d);
        }
        if (is_original_input) {
          RETURN_IF_ERROR(lrequest->AddOriginalInput(
              input_meta.first, input_meta.second.data_type(),
              input_meta_shape, &input));
        } else {
          input_sps.emplace_back();
          RETURN_IF_ERROR(lrequest->AddOverrideInput(
              input_meta.first, input_meta.second.data_type(),
              (model_->Config().max_batch_size() != 0 ? 1 : 0),
              input_meta_shape, &input_sps.back()));
          input = input_sps.back().get();
        }
        RETURN_IF_ERROR(input->AppendData(
            allocated_ptr, batch_byte_size,
            TRITONSERVER_MEMORY_CPU /* memory_type */,
            0 /* memory_type_id */));
      }

      RETURN_IF_ERROR(lrequest->PrepareForInference());
      // Override inputs must be added after PrepareForInference() is called
      for (const auto& sp : input_sps) {
        RETURN_IF_ERROR(lrequest->AddOverrideInput(sp));
      }
    }
  }

  return Status::Success;
}

void
TritonModelInstance::Schedule(
    std::vector<std::unique_ptr<InferenceRequest>>&& requests,
    const std::function<void()>& OnCompletion)
{
  // Use a thread local vector to avoid needing to malloc each
  // time an inference is run.
  thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
  triton_requests.clear();
  for (auto& r : requests) {
    // Load the input states for the inference request.
    r->LoadInputStates();
    triton_requests.push_back(
        reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
  }

  Execute(triton_requests);
  OnCompletion();
}

Status
TritonModelInstance::Initialize()
{
  RETURN_IF_ERROR(SetNumaConfigOnThread(HostPolicy()));
  return Status::Success;
}

Status
TritonModelInstance::WarmUp()
{
  // Move samples to a local variable for scoped cleanup
  std::vector<triton::core::TritonModelInstance::WarmupData> lwarmup_samples;
  lwarmup_samples.swap(warmup_samples_);

  for (auto& sample : lwarmup_samples) {
    for (size_t iteration = 1; iteration <= sample.count_; ++iteration) {
      LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
                     << "' instance " << Name()
                     << " is running warmup sample '" << sample.sample_name_
                     << "' for iteration " << iteration;

      // Request/response completion is asynchronous, so use promises to wait
      // for completion. Also collect error messages from the responses in a
      // vector.
      std::vector<std::promise<void>> request_complete(
          sample.requests_.size());
      std::vector<std::string> response_errors;
      std::vector<std::pair<std::promise<void>, std::vector<std::string>*>>
          response_complete(sample.requests_.size());

      std::vector<TRITONBACKEND_Request*> triton_requests;
      for (size_t i = 0; i < sample.requests_.size(); ++i) {
        auto& request = sample.requests_[i];
        request->SetReleaseCallback(
            WarmupRequestComplete, &request_complete[i]);
        response_complete[i].second = &response_errors;
        request->SetResponseCallback(
            &warmup_allocator, nullptr, WarmupResponseComplete,
            &response_complete[i]);
        // Capture timestamp before run to avoid incorrect accumulation from
        // sequential warmup runs
#ifdef TRITON_ENABLE_STATS
        request->CaptureRequestStartNs();
#endif  // TRITON_ENABLE_STATS
        request->CaptureQueueStartNs();
        triton_requests.push_back(
            reinterpret_cast<TRITONBACKEND_Request*>(request.get()));
      }

      Execute(triton_requests);

      // Wait for the warmup sample to complete and check for errors
      for (size_t i = 0; i < sample.requests_.size(); ++i) {
        request_complete[i].get_future().get();
        response_complete[i].first.get_future().get();
      }
      if (response_errors.size() != 0) {
        std::string err_str =
            "failed to run warmup sample '" + sample.sample_name_ + "': ";
        for (const auto& error : response_errors) {
          err_str += (error + "; ");
        }
        // End warmup as soon as there is a failing sample
        LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
                       << "' instance " << Name()
                       << " failed to run warmup sample '"
                       << sample.sample_name_ << "'";
        return Status(Status::Code::INVALID_ARG, err_str);
      }
    }
  }

  return Status::Success;
}

void
TritonModelInstance::Execute(
    std::vector<TRITONBACKEND_Request*>& triton_requests)
{
  TRITONBACKEND_ModelInstance* triton_model_instance =
      reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
  TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
      model_->Backend()->ModelInstanceExecFn();

  // If there is an error then we retain ownership of 'requests'
  // and must send error responses.
  TRITONSERVER_Error* err = inst_exec_fn(
      triton_model_instance, &triton_requests[0], triton_requests.size());
  if (err != nullptr) {
    Status status = Status(
        TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
        TRITONSERVER_ErrorMessage(err));
    for (TRITONBACKEND_Request* tr : triton_requests) {
      std::unique_ptr<InferenceRequest> ur(
          reinterpret_cast<InferenceRequest*>(tr));
      InferenceRequest::RespondIfError(
          ur, status, true /* release_requests */);
    }

    TRITONSERVER_ErrorDelete(err);
  }
}

Status
TritonModelInstance::TritonBackendThread::CreateBackendThread(
    const std::string name, TritonModelInstance* model_instance,
    const int nice, const int32_t device_id,
    std::unique_ptr<TritonBackendThread>* triton_backend_thread)
{
  TritonBackendThread* raw_triton_backend_thread =
      new TritonBackendThread(name, model_instance->Model());
  std::unique_ptr<TritonBackendThread> runner(raw_triton_backend_thread);
  runner->AddModelInstance(model_instance);
  runner->backend_thread_ =
      std::thread([raw_triton_backend_thread, nice, device_id]() {
        raw_triton_backend_thread->BackendThread(nice, device_id);
      });

  triton_backend_thread->reset(runner.release());

  return Status::Success;
}

void
TritonModelInstance::TritonBackendThread::AddModelInstance(
    TritonModelInstance* model_instance)
{
  model_instances_.push_back(model_instance);
}

Status
TritonModelInstance::TritonBackendThread::InitAndWarmUpModelInstance(
    TritonModelInstance* model_instance)
{
  // Initialize the instance on the backend thread
  auto init_payload = model_->Server()->GetRateLimiter()->GetPayload(
      Payload::Operation::INIT, model_instance);
  RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
      model_, init_payload));
  RETURN_IF_ERROR(init_payload->Wait());

  // Warm up the instance on the backend thread
  auto warmup_payload = model_->Server()->GetRateLimiter()->GetPayload(
      Payload::Operation::WARM_UP, model_instance);
  RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
      model_, warmup_payload));
  RETURN_IF_ERROR(warmup_payload->Wait());

  return Status::Success;
}

TritonModelInstance::TritonBackendThread::TritonBackendThread(
    const std::string& name, TritonModel* model)
    : name_(name), model_(model)
{
}

TritonModelInstance::TritonBackendThread::~TritonBackendThread()
{
  StopBackendThread();
}

void
TritonModelInstance::TritonBackendThread::StopBackendThread()
{
  if (backend_thread_.joinable()) {
    // Signal the backend thread to exit and then wait for it...
    auto exit_payload = model_->Server()->GetRateLimiter()->GetPayload(
        Payload::Operation::EXIT, model_instances_.back());
    model_->Server()->GetRateLimiter()->EnqueuePayload(model_, exit_payload);
    backend_thread_.join();
  }
}

void
TritonModelInstance::TritonBackendThread::BackendThread(
    const int nice, const int32_t device_id)
{
#ifndef _WIN32
  if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
    LOG_VERBOSE(1) << "Starting backend thread for " << name_ << " at nice "
                   << nice << " on device " << device_id << "...";
  } else {
    LOG_VERBOSE(1) << "Starting backend thread for " << name_
                   << " at default nice (requested nice " << nice << " failed)"
                   << " on device " << device_id << "...";
  }
#else
  LOG_VERBOSE(1) << "Starting backend thread for " << name_
                 << " at default nice on device " << device_id << "...";
#endif

  bool should_exit = false;
  while (!should_exit) {
    std::shared_ptr<Payload> payload;
    model_->Server()->GetRateLimiter()->DequeuePayload(
        model_instances_, &payload);
    NVTX_RANGE(nvtx_, "BackendThread " + name_);
    payload->Execute(&should_exit);
    model_instances_.push_back(payload->GetInstance());
    // Release the payload to the RateLimiter
    model_->Server()->GetRateLimiter()->PayloadRelease(payload);
  }
  LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}

extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
    TRITONBACKEND_ModelInstance* instance, const char** name)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *name = ti->Name().c_str();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
    TRITONBACKEND_ModelInstance* instance,
    TRITONSERVER_InstanceGroupKind* kind)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *kind = ti->Kind();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
    TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *device_id = ti->DeviceId();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
    TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *host_policy = const_cast<TRITONSERVER_Message*>(
      reinterpret_cast<const TRITONSERVER_Message*>(
          &ti->HostPolicyMessage()));
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *count = ti->Profiles().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
    TRITONBACKEND_ModelInstance* instance, const uint32_t index,
    const char** profile_name)
{
  *profile_name = nullptr;

  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  const auto& rprofiles = ti->Profiles();
  if (index >= rprofiles.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("out of bounds index ") + std::to_string(index) +
         ": instance is configured with " + std::to_string(rprofiles.size()) +
         " profiles")
            .c_str());
  }

  *profile_name = rprofiles[index].c_str();

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
    TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *count = ti->SecondaryDevices().size();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
    TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
    int64_t* id)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  const auto& rsecondarydevices = ti->SecondaryDevices();

  if (index >= rsecondarydevices.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        (std::string("out of bounds index ") + std::to_string(index) +
         ": instance is configured with " +
         std::to_string(rsecondarydevices.size()) + " secondary devices")
            .c_str());
  }

  *kind = rsecondarydevices[index].kind_.c_str();
  *id = rsecondarydevices[index].id_;

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
    TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *is_passive = ti->IsPassive();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
    TRITONBACKEND_ModelInstance* instance, void** state)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  *state = ti->State();
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
    TRITONBACKEND_ModelInstance* instance, void* state)
{
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  ti->SetState(state);
  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
    const bool success, const uint64_t exec_start_ns,
    const uint64_t compute_start_ns, const uint64_t compute_end_ns,
    const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
  tr->ReportStatistics(
      ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
#endif  // TRITON_ENABLE_STATS

  return nullptr;  // success
}

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
    TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
    const uint64_t exec_start_ns, const uint64_t compute_start_ns,
    const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
  TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
  ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
      ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
      compute_end_ns, exec_end_ns);
#endif  // TRITON_ENABLE_STATS

  return nullptr;  // success
}

}  // extern C

}}  // namespace triton::core
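For reference, a minimal sketch of how a backend shared library might call the extern "C" accessors defined above from its instance-initialization entry point. TRITONBACKEND_ModelInstanceInitialize is the standard backend hook that the server invokes through ModelInstanceInitFn(); the std::string state object and the manual error propagation below are illustrative assumptions, not part of this file.

// Illustrative sketch only: a hypothetical backend querying the instance
// accessors defined above and attaching its own opaque state.
#include <cstdint>
#include <string>
#include "triton/core/tritonbackend.h"

extern "C" TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
  const char* name = nullptr;
  TRITONSERVER_Error* err = TRITONBACKEND_ModelInstanceName(instance, &name);
  if (err != nullptr) {
    return err;
  }

  TRITONSERVER_InstanceGroupKind kind;
  if ((err = TRITONBACKEND_ModelInstanceKind(instance, &kind)) != nullptr) {
    return err;
  }

  int32_t device_id = 0;
  if ((err = TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id)) !=
      nullptr) {
    return err;
  }

  // Attach backend-specific state; the server hands the same opaque pointer
  // back through TRITONBACKEND_ModelInstanceState() and it would typically be
  // released in TRITONBACKEND_ModelInstanceFinalize.
  return TRITONBACKEND_ModelInstanceSetState(instance, new std::string(name));
}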
3rdparty/core-r22.12/src/backend_model_instance.h
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
#include <future>
#include <memory>
#include <string>
#include <thread>
#include "constants.h"
#include "memory.h"
#include "metric_model_reporter.h"
#include "model_config.pb.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/sync_queue.h"
namespace triton { namespace core {

class TritonModel;
class InferenceRequest;

//
// Represents a model instance.
//
class TritonModelInstance {
 public:
  static Status CreateInstances(
      TritonModel* model,
      const triton::common::BackendCmdlineConfigMap&
          backend_cmdline_config_map,
      const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
      const inference::ModelConfig& model_config, const bool device_blocking);
  ~TritonModelInstance();

  const std::string& Name() const { return name_; }
  size_t Index() const { return index_; }
  TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
  int32_t DeviceId() const { return device_id_; }
  const triton::common::HostPolicyCmdlineConfig& HostPolicy() const
  {
    return host_policy_;
  }
  const TritonServerMessage& HostPolicyMessage() const
  {
    return host_policy_message_;
  }
  bool IsPassive() const { return passive_; }
  const std::vector<std::string>& Profiles() const { return profile_names_; }

  struct SecondaryDevice {
    SecondaryDevice(const std::string kind, const int64_t id)
        : kind_(kind), id_(id)
    {
    }
    const std::string kind_;
    const int64_t id_;
  };
  const std::vector<SecondaryDevice>& SecondaryDevices() const
  {
    return secondary_devices_;
  }

  Status Initialize();
  Status WarmUp();
  void Schedule(
      std::vector<std::unique_ptr<InferenceRequest>>&& requests,
      const std::function<void()>& OnCompletion);

  TritonModel* Model() const { return model_; }

  void* State() { return state_; }
  void SetState(void* state) { state_ = state; }

  MetricModelReporter* MetricReporter() const { return reporter_.get(); }

 private:
  DISALLOW_COPY_AND_ASSIGN(TritonModelInstance);

  class TritonBackendThread;

  TritonModelInstance(
      TritonModel* model, const std::string& name, const size_t index,
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const std::vector<std::string>& profile_names, const bool passive,
      const triton::common::HostPolicyCmdlineConfig& host_policy,
      const TritonServerMessage& host_policy_message,
      const std::vector<SecondaryDevice>& secondary_devices);

  static Status CreateInstance(
      TritonModel* model, const std::string& name, const size_t index,
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const std::vector<std::string>& profile_names, const bool passive,
      const std::string& host_policy_name,
      const triton::common::HostPolicyCmdlineConfig& host_policy,
      const inference::ModelRateLimiter& rate_limiter_config,
      const bool device_blocking,
      std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
          device_to_thread_map,
      const std::vector<SecondaryDevice>& secondary_devices);

  Status SetBackendThread(
      const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
      const bool device_blocking,
      std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
          device_to_thread_map);

  Status GenerateWarmupData();

  void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);

  class TritonBackendThread {
   public:
    static Status CreateBackendThread(
        const std::string name, TritonModelInstance* model, const int nice,
        const int32_t device_id,
        std::unique_ptr<TritonBackendThread>* triton_backend_thread);
    void AddModelInstance(TritonModelInstance* model_instance);
    Status InitAndWarmUpModelInstance(TritonModelInstance* model_instance);
    void StopBackendThread();
    ~TritonBackendThread();

   private:
    TritonBackendThread(const std::string& name, TritonModel* model);
    void BackendThread(const int nice, const int32_t device_id);

    std::string name_;

    TritonModel* model_;
    std::deque<TritonModelInstance*> model_instances_;

    std::thread backend_thread_;
    std::atomic<bool> backend_thread_exit_;
  };
  std::shared_ptr<TritonBackendThread> triton_backend_thread_;

  struct WarmupData {
    WarmupData(const std::string& sample_name, const size_t count)
        : sample_name_(sample_name), count_(std::max(count, size_t{1}))
    {
    }

    std::string sample_name_;
    size_t count_;
    // Using a batch of requests to satisfy batch size, this provides better
    // alignment on the batch expected by the model, especially for sequence
    // models.
    std::vector<std::unique_ptr<InferenceRequest>> requests_;

    // Placeholder for input data
    std::unique_ptr<AllocatedMemory> zero_data_;
    std::unique_ptr<AllocatedMemory> random_data_;
    std::vector<std::unique_ptr<std::string>> provided_data_;
  };
  std::vector<WarmupData> warmup_samples_;

  // The TritonModel object that owns this instance. The instance
  // holds this as a raw pointer because the lifetime of the model is
  // guaranteed to be longer than the lifetime of an instance owned by the
  // model.
  TritonModel* model_;

  std::string name_;
  size_t index_;

  // For CPU, device_id_ is always 0. For GPU, device_id_ indicates the
  // GPU device to be used by the instance.
  TRITONSERVER_InstanceGroupKind kind_;
  int32_t device_id_;
  const triton::common::HostPolicyCmdlineConfig host_policy_;
  TritonServerMessage host_policy_message_;
  std::vector<std::string> profile_names_;
  bool passive_;

  std::vector<SecondaryDevice> secondary_devices_;

  // Reporter for metrics, or nullptr if no metrics should be reported
  std::shared_ptr<MetricModelReporter> reporter_;

  // Opaque state associated with this model instance.
  void* state_;
};

}}  // namespace triton::core
3rdparty/core-r22.12/src/buffer_attributes.cc
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "buffer_attributes.h"
#include <cstring>
#include "constants.h"
namespace triton { namespace core {

void
BufferAttributes::SetByteSize(const size_t& byte_size)
{
  byte_size_ = byte_size;
}

void
BufferAttributes::SetMemoryType(const TRITONSERVER_MemoryType& memory_type)
{
  memory_type_ = memory_type;
}

void
BufferAttributes::SetMemoryTypeId(const int64_t& memory_type_id)
{
  memory_type_id_ = memory_type_id;
}

void
BufferAttributes::SetCudaIpcHandle(void* cuda_ipc_handle)
{
  char* lcuda_ipc_handle = reinterpret_cast<char*>(cuda_ipc_handle);
  cuda_ipc_handle_.clear();
  std::copy(
      lcuda_ipc_handle, lcuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
      std::back_inserter(cuda_ipc_handle_));
}

void*
BufferAttributes::CudaIpcHandle()
{
  if (cuda_ipc_handle_.empty()) {
    return nullptr;
  } else {
    return reinterpret_cast<void*>(cuda_ipc_handle_.data());
  }
}

size_t
BufferAttributes::ByteSize() const
{
  return byte_size_;
}

TRITONSERVER_MemoryType
BufferAttributes::MemoryType() const
{
  return memory_type_;
}

int64_t
BufferAttributes::MemoryTypeId() const
{
  return memory_type_id_;
}

BufferAttributes::BufferAttributes(
    size_t byte_size, TRITONSERVER_MemoryType memory_type,
    int64_t memory_type_id, char* cuda_ipc_handle)
    : byte_size_(byte_size), memory_type_(memory_type),
      memory_type_id_(memory_type_id)
{
  // cuda ipc handle size
  cuda_ipc_handle_.reserve(CUDA_IPC_STRUCT_SIZE);

  if (cuda_ipc_handle != nullptr) {
    std::copy(
        cuda_ipc_handle, cuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
        std::back_inserter(cuda_ipc_handle_));
  }
}

}}  // namespace triton::core
3rdparty/core-r22.12/src/buffer_attributes.h
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <iterator>
#include <vector>
#include "tritonserver_apis.h"
#pragma once
namespace triton { namespace core {

//
// A class to hold information about the buffer allocation.
//
class BufferAttributes {
 public:
  BufferAttributes(
      size_t byte_size, TRITONSERVER_MemoryType memory_type,
      int64_t memory_type_id, char cuda_ipc_handle[64]);
  BufferAttributes()
  {
    memory_type_ = TRITONSERVER_MEMORY_CPU;
    memory_type_id_ = 0;
    cuda_ipc_handle_.reserve(64);
  }

  // Set the buffer byte size
  void SetByteSize(const size_t& byte_size);

  // Set the buffer memory_type
  void SetMemoryType(const TRITONSERVER_MemoryType& memory_type);

  // Set the buffer memory type id
  void SetMemoryTypeId(const int64_t& memory_type_id);

  // Set the cuda ipc handle
  void SetCudaIpcHandle(void* cuda_ipc_handle);

  // Get the cuda ipc handle
  void* CudaIpcHandle();

  // Get the byte size
  size_t ByteSize() const;

  // Get the memory type
  TRITONSERVER_MemoryType MemoryType() const;

  // Get the memory type id
  int64_t MemoryTypeId() const;

 private:
  size_t byte_size_;
  TRITONSERVER_MemoryType memory_type_;
  int64_t memory_type_id_;
  std::vector<char> cuda_ipc_handle_;
};

}}  // namespace triton::core
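A minimal usage sketch of the BufferAttributes class declared above; the function name and the chosen size and memory type are illustrative values, not taken from this file.

// Illustrative sketch: describing a 1 MiB buffer that resides on GPU 0.
#include <cstdint>
#include "buffer_attributes.h"

void
DescribeOutputBuffer()
{
  triton::core::BufferAttributes attrs;
  attrs.SetByteSize(1 << 20);
  attrs.SetMemoryType(TRITONSERVER_MEMORY_GPU);
  attrs.SetMemoryTypeId(0);

  // No CUDA IPC handle was set, so CudaIpcHandle() returns nullptr here.
  void* ipc = attrs.CudaIpcHandle();
  (void)ipc;
}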
3rdparty/core-r22.12/src/constants.h
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
namespace triton { namespace core {

constexpr char kInferHeaderContentLengthHTTPHeader[] =
    "Inference-Header-Content-Length";
constexpr char kAcceptEncodingHTTPHeader[] = "Accept-Encoding";
constexpr char kContentEncodingHTTPHeader[] = "Content-Encoding";
constexpr char kContentTypeHeader[] = "Content-Type";
constexpr char kContentLengthHeader[] = "Content-Length";

constexpr char kTensorFlowGraphDefPlatform[] = "tensorflow_graphdef";
constexpr char kTensorFlowSavedModelPlatform[] = "tensorflow_savedmodel";
constexpr char kTensorFlowGraphDefFilename[] = "model.graphdef";
constexpr char kTensorFlowSavedModelFilename[] = "model.savedmodel";
constexpr char kTensorFlowBackend[] = "tensorflow";

constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";
constexpr char kTensorRTPlanFilename[] = "model.plan";
constexpr char kTensorRTBackend[] = "tensorrt";

constexpr char kOnnxRuntimeOnnxPlatform[] = "onnxruntime_onnx";
constexpr char kOnnxRuntimeOnnxFilename[] = "model.onnx";
constexpr char kOnnxRuntimeBackend[] = "onnxruntime";

constexpr char kOpenVINORuntimeOpenVINOFilename[] = "model.xml";
constexpr char kOpenVINORuntimeBackend[] = "openvino";

constexpr char kPyTorchLibTorchPlatform[] = "pytorch_libtorch";
constexpr char kPyTorchLibTorchFilename[] = "model.pt";
constexpr char kPyTorchBackend[] = "pytorch";

constexpr char kPythonFilename[] = "model.py";
constexpr char kPythonBackend[] = "python";

#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
#endif  // TRITON_ENABLE_ENSEMBLE

constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
    "auto_mixed_precision";

constexpr char kModelConfigPbTxt[] = "config.pbtxt";

constexpr char kMetricsLabelModelName[] = "model";
constexpr char kMetricsLabelModelVersion[] = "version";
constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";

constexpr char kWarmupDataFolder[] = "warmup";
constexpr char kInitialStateFolder[] = "initial_state";

constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000;
constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128;
constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;

#ifdef TRITON_ENABLE_METRICS
// MetricModelReporter expects a device ID for GPUs, but we reuse this device
// ID for other metrics as well such as for CPU and Response Cache metrics
constexpr int METRIC_REPORTER_ID_CPU = -1;
constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
#endif

#define TIMESPEC_TO_NANOS(TS) \
  ((TS).tv_sec * triton::core::NANOS_PER_SECOND + (TS).tv_nsec)
#define TIMESPEC_TO_MILLIS(TS) \
  (TIMESPEC_TO_NANOS(TS) / triton::core::NANOS_PER_MILLIS)

#define DISALLOW_MOVE(TypeName) TypeName(Context&& o) = delete;
#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete;
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete;
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
  DISALLOW_COPY(TypeName)                  \
  DISALLOW_ASSIGN(TypeName)

}}  // namespace triton::core
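A small sketch of how the timing and copy-control macros above are typically used. The Widget class and NowNanos helper are hypothetical, and clock_gettime assumes a POSIX platform; nothing here is part of this header.

// Illustrative sketch: convert a timespec to nanoseconds and disable copy
// semantics on a hypothetical class using the macros defined above.
#include <cstdint>
#include <ctime>
#include "constants.h"

class Widget {
 public:
  Widget() = default;

 private:
  // Expands to deleted copy constructor and copy assignment.
  DISALLOW_COPY_AND_ASSIGN(Widget);
};

uint64_t
NowNanos()
{
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return TIMESPEC_TO_NANOS(ts);
}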
3rdparty/core-r22.12/src/cuda_memory_manager.cc
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "cuda_memory_manager.h"
#include <cnmem.h>
#include <string.h>
#include <set>
#include "cuda_utils.h"
#include "triton/common/logging.h"
namespace
{
#define RETURN_IF_CNMEM_ERROR(S, MSG) \
do { \
auto status__ = (S); \
if (status__ != CNMEM_STATUS_SUCCESS) { \
return Status( \
Status::Code::INTERNAL, \
(MSG) + ": " + cnmemGetErrorString(status__)); \
} \
} while (false)
std
::
string
PointerToString
(
void
*
ptr
)
{
std
::
stringstream
ss
;
ss
<<
ptr
;
return
ss
.
str
();
}
}
// namespace
namespace
triton
{
namespace
core
{
std
::
unique_ptr
<
CudaMemoryManager
>
CudaMemoryManager
::
instance_
;
std
::
mutex
CudaMemoryManager
::
instance_mu_
;
CudaMemoryManager
::~
CudaMemoryManager
()
{
if
(
has_allocation_
)
{
auto
status
=
cnmemFinalize
();
if
(
status
!=
CNMEM_STATUS_SUCCESS
)
{
LOG_ERROR
<<
"Failed to finalize CUDA memory manager: ["
<<
status
<<
"] "
<<
cnmemGetErrorString
(
status
);
}
}
}
void
CudaMemoryManager
::
Reset
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
instance_mu_
);
instance_
.
reset
();
}
Status
CudaMemoryManager
::
Create
(
const
CudaMemoryManager
::
Options
&
options
)
{
// Ensure thread-safe creation of CUDA memory pool
std
::
lock_guard
<
std
::
mutex
>
lock
(
instance_mu_
);
if
(
instance_
!=
nullptr
)
{
LOG_WARNING
<<
"New CUDA memory pools could not be created since they "
"already exists"
;
return
Status
::
Success
;
}
std
::
set
<
int
>
supported_gpus
;
auto
status
=
GetSupportedGPUs
(
&
supported_gpus
,
options
.
min_supported_compute_capability_
);
if
(
status
.
IsOk
())
{
std
::
vector
<
cnmemDevice_t
>
devices
;
for
(
auto
gpu
:
supported_gpus
)
{
const
auto
it
=
options
.
memory_pool_byte_size_
.
find
(
gpu
);
if
((
it
!=
options
.
memory_pool_byte_size_
.
end
())
&&
(
it
->
second
!=
0
))
{
devices
.
emplace_back
();
auto
&
device
=
devices
.
back
();
memset
(
&
device
,
0
,
sizeof
(
device
));
device
.
device
=
gpu
;
device
.
size
=
it
->
second
;
LOG_INFO
<<
"CUDA memory pool is created on device "
<<
device
.
device
<<
" with size "
<<
device
.
size
;
}
}
if
(
!
devices
.
empty
())
{
RETURN_IF_CNMEM_ERROR
(
cnmemInit
(
devices
.
size
(),
devices
.
data
(),
CNMEM_FLAGS_CANNOT_GROW
),
std
::
string
(
"Failed to finalize CUDA memory manager"
));
}
else
{
LOG_INFO
<<
"CUDA memory pool disabled"
;
}
// Use to finalize CNMeM properly when out of scope
instance_
.
reset
(
new
CudaMemoryManager
(
!
devices
.
empty
()));
}
else
{
return
Status
(
status
.
ErrorCode
(),
"Failed to initialize CUDA memory manager: "
+
status
.
Message
());
}
return
Status
::
Success
;
}
Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  } else if (!instance_->has_allocation_) {
    return Status(
        Status::Code::UNAVAILABLE,
        "CudaMemoryManager has no preallocated CUDA memory");
  }

  int current_device;
  RETURN_IF_CUDA_ERR(
      cudaGetDevice(&current_device), std::string("Failed to get device"));
  bool overridden = (current_device != device_id);
  if (overridden) {
    RETURN_IF_CUDA_ERR(
        cudaSetDevice(device_id), std::string("Failed to set device"));
  }

  // Defer returning error to make sure the device is recovered
  auto err = cnmemMalloc(ptr, size, nullptr);
  if (overridden) {
    cudaSetDevice(current_device);
  }

  RETURN_IF_CNMEM_ERROR(
      err, std::string("Failed to allocate CUDA memory with byte size ") +
               std::to_string(size) + " on GPU " + std::to_string(device_id));
  return Status::Success;
}
Status
CudaMemoryManager::Free(void* ptr, int64_t device_id)
{
  if (instance_ == nullptr) {
    return Status(
        Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
  } else if (!instance_->has_allocation_) {
    return Status(
        Status::Code::UNAVAILABLE,
        "CudaMemoryManager has no preallocated CUDA memory");
  }

  int current_device;
  RETURN_IF_CUDA_ERR(
      cudaGetDevice(&current_device), std::string("Failed to get device"));
  bool overridden = (current_device != device_id);
  if (overridden) {
    RETURN_IF_CUDA_ERR(
        cudaSetDevice(device_id), std::string("Failed to set device"));
  }

  // Defer returning error to make sure the device is recovered
  auto err = cnmemFree(ptr, nullptr);
  if (overridden) {
    cudaSetDevice(current_device);
  }

  RETURN_IF_CNMEM_ERROR(
      err, std::string("Failed to deallocate CUDA memory at address ") +
               PointerToString(ptr) + " on GPU " + std::to_string(device_id));
  return Status::Success;
}

}}  // namespace triton::core
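A minimal usage sketch of the allocation path above, assuming CudaMemoryManager::Create() has already succeeded during server startup; the byte size, device id, and calling context are hypothetical and not part of this commit.

  void* buffer = nullptr;
  Status status =
      CudaMemoryManager::Alloc(&buffer, 1 << 20 /* 1 MiB */, 0 /* GPU 0 */);
  if (!status.IsOk()) {
    LOG_ERROR << status.Message();
  } else {
    // ... use 'buffer' as ordinary device memory on GPU 0 ...
    status = CudaMemoryManager::Free(buffer, 0 /* GPU 0 */);
    if (!status.IsOk()) {
      LOG_ERROR << status.Message();
    }
  }

If Create() was never called, or no pool was reserved, both Alloc() and Free() return Status::Code::UNAVAILABLE rather than falling back to plain cudaMalloc.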
3rdparty/core-r22.12/src/cuda_memory_manager.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include "status.h"
namespace triton { namespace core {

// This is a singleton class responsible for maintaining CUDA memory pool
// used by the inference server. CUDA memory allocations and deallocations
// must be requested via functions provided by this class.
class CudaMemoryManager {
 public:
  // Options to configure CUDA memory manager.
  struct Options {
    Options(double cc = 6.0, const std::map<int, uint64_t>& s = {})
        : min_supported_compute_capability_(cc), memory_pool_byte_size_(s)
    {
    }

    // The minimum compute capability of the supported devices.
    double min_supported_compute_capability_;

    // The size of CUDA memory reserved for the specified devices.
    // The memory size will be rounded up to align with
    // the default granularity (512 bytes).
    // No memory will be reserved for devices that are not listed.
    std::map<int, uint64_t> memory_pool_byte_size_;
  };

  ~CudaMemoryManager();

  // Create the memory manager based on 'options' specified.
  // Return Status object indicating success or failure.
  static Status Create(const Options& options);

  // Allocate CUDA memory on GPU 'device_id' with
  // the requested 'size' and return the pointer in 'ptr'.
  // Return Status object indicating success or failure.
  static Status Alloc(void** ptr, uint64_t size, int64_t device_id);

  // Free the memory allocated by the memory manager on 'device_id'.
  // Return Status object indicating success or failure.
  static Status Free(void* ptr, int64_t device_id);

 protected:
  // Provide explicit control on the lifecycle of the CUDA memory manager,
  // for testing only.
  static void Reset();

 private:
  CudaMemoryManager(bool has_allocation) : has_allocation_(has_allocation) {}

  bool has_allocation_;
  static std::unique_ptr<CudaMemoryManager> instance_;
  static std::mutex instance_mu_;
};

}}  // namespace triton::core
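A minimal configuration sketch for the Options struct documented above, assuming it runs once at startup; the pool sizes and device ids are hypothetical. Devices omitted from the map get no pool, and an entry with size 0 has the same effect.

  // Reserve a 256 MiB pool on GPU 0 and a 128 MiB pool on GPU 1, requiring
  // compute capability 6.0 or newer.
  CudaMemoryManager::Options options(
      6.0 /* min compute capability */,
      {{0, 256 * 1024 * 1024}, {1, 128 * 1024 * 1024}});
  Status status = CudaMemoryManager::Create(options);
  if (!status.IsOk()) {
    LOG_ERROR << "CUDA memory pool creation failed: " << status.Message();
  }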
3rdparty/core-r22.12/src/cuda_utils.cc
0 → 100644
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda_utils.h"
#include "model_config_utils.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {

#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
  auto* copy_params = reinterpret_cast<CopyParams*>(args);
  memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
  delete copy_params;
}
#endif  // TRITON_ENABLE_GPU
Status
GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total)
{
  *free = 0;
  *total = 0;
#ifdef TRITON_ENABLE_GPU
  // Make sure the correct device is set before querying its memory info and
  // then restore the device to what was set by the caller.
  int current_device;
  auto cuerr = cudaGetDevice(&current_device);
  bool overridden = false;
  if (cuerr == cudaSuccess) {
    overridden = (current_device != device_id);
    if (overridden) {
      cuerr = cudaSetDevice(device_id);
    }
  }

  if (cuerr == cudaSuccess) {
    cuerr = cudaMemGetInfo(free, total);
  }

  if (overridden) {
    cudaSetDevice(current_device);
  }

  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        (std::string("unable to get memory info for device ") +
         std::to_string(device_id) + ": " + cudaGetErrorString(cuerr)));
  }
#endif  // TRITON_ENABLE_GPU
  return Status::Success;
}
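// Illustrative call (sketch, not part of the upstream source): query free and
// total bytes on GPU 0; the caller's current device is restored internally.
// In non-GPU builds both values stay 0 and Status::Success is returned.
//
//   size_t free_bytes = 0, total_bytes = 0;
//   Status status = GetDeviceMemoryInfo(0, &free_bytes, &total_bytes);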
Status
EnablePeerAccess(const double min_compute_capability)
{
#ifdef TRITON_ENABLE_GPU
  // If we can't enable peer access for one device pair, the best we can
  // do is skipping it...
  std::set<int> supported_gpus;
  bool all_enabled = false;
  if (GetSupportedGPUs(&supported_gpus, min_compute_capability).IsOk()) {
    all_enabled = true;
    int can_access_peer = false;
    for (const auto& host : supported_gpus) {
      auto cuerr = cudaSetDevice(host);
      if (cuerr == cudaSuccess) {
        for (const auto& peer : supported_gpus) {
          if (host == peer) {
            continue;
          }

          cuerr = cudaDeviceCanAccessPeer(&can_access_peer, host, peer);
          if ((cuerr == cudaSuccess) && (can_access_peer == 1)) {
            cuerr = cudaDeviceEnablePeerAccess(peer, 0);
          }

          all_enabled &= ((cuerr == cudaSuccess) && (can_access_peer == 1));
        }
      }
    }
  }

  if (!all_enabled) {
    return Status(
        Status::Code::UNSUPPORTED,
        "failed to enable peer access for some device pairs");
  }
#endif  // TRITON_ENABLE_GPU
  return Status::Success;
}
Status
CopyBuffer(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, bool* cuda_used, bool copy_on_stream)
{
  NVTX_RANGE(nvtx_, "CopyBuffer");

  *cuda_used = false;

  // For CUDA memcpy, a host-to-host copy is blocking with respect to the
  // host, so use memcpy() directly. In this case, need to be careful on
  // whether the src buffer is valid.
  if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
      (dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
    if (copy_on_stream) {
      auto params = new CopyParams(dst, src, byte_size);
      cudaLaunchHostFunc(
          cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
      *cuda_used = true;
    } else {
      memcpy(dst, src, byte_size);
    }
#else
    memcpy(dst, src, byte_size);
#endif  // TRITON_ENABLE_GPU
  } else {
#ifdef TRITON_ENABLE_GPU
    RETURN_IF_CUDA_ERR(
        cudaMemcpyAsync(dst, src, byte_size, cudaMemcpyDefault, cuda_stream),
        msg + ": failed to perform CUDA copy");
    *cuda_used = true;
#else
    return Status(
        Status::Code::INTERNAL,
        msg + ": try to use CUDA copy while GPU is not supported");
#endif  // TRITON_ENABLE_GPU
  }

  return Status::Success;
}
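// Illustrative sketch (not part of the upstream source): one way a caller
// might stage a host-to-GPU copy through CopyBuffer(). The buffer pointers,
// byte size, memory-type ids, and stream are hypothetical, and the call uses
// the same 10-argument form as CopyBufferHandler() below.
//
//   bool cuda_used = false;
//   Status status = CopyBuffer(
//       "input tensor", TRITONSERVER_MEMORY_CPU, 0 /* src id */,
//       TRITONSERVER_MEMORY_GPU, 0 /* dst id */, byte_size, host_src,
//       gpu_dst, stream, &cuda_used);
//   if (status.IsOk() && cuda_used) {
//     // The copy was issued asynchronously on 'stream'; synchronize (or use
//     // a CUDA event) before the destination buffer is consumed.
//     cudaStreamSynchronize(stream);
//   }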
void
CopyBufferHandler(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, void* response_ptr,
    triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
        completion_queue)
{
  bool cuda_used = false;
  Status status = CopyBuffer(
      msg, src_memory_type, src_memory_type_id, dst_memory_type,
      dst_memory_type_id, byte_size, src, dst, cuda_stream, &cuda_used);
  completion_queue->Put(std::make_tuple(status, cuda_used, response_ptr));
}
#ifdef TRITON_ENABLE_GPU
Status
CheckGPUCompatibility(const int gpu_id, const double min_compute_capability)
{
  // Query the compute capability from the device
  cudaDeviceProp cuprops;
  cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get CUDA device properties for GPU ID " +
            std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
  }

  double compute_compability = cuprops.major + (cuprops.minor / 10.0);
  if ((compute_compability > min_compute_capability) ||
      (abs(compute_compability - min_compute_capability) < 0.01)) {
    return Status::Success;
  } else {
    return Status(
        Status::Code::UNSUPPORTED,
        "gpu " + std::to_string(gpu_id) + " has compute capability '" +
            std::to_string(cuprops.major) + "." +
            std::to_string(cuprops.minor) +
            "' which is less than the minimum supported of '" +
            std::to_string(min_compute_capability) + "'");
  }
}
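// Worked example of the check above: a GPU reporting major=7, minor=5 yields
// compute capability 7 + 5 / 10.0 = 7.5, which passes the first test against
// min_compute_capability = 6.0. A device reporting exactly 6.0 is accepted by
// the second test, which treats differences smaller than 0.01 as equality to
// absorb floating-point rounding.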
Status
GetSupportedGPUs(
    std::set<int>* supported_gpus, const double min_compute_capability)
{
  // Make sure set is empty before starting
  supported_gpus->clear();

  int device_cnt;
  cudaError_t cuerr = cudaGetDeviceCount(&device_cnt);
  if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {
    device_cnt = 0;
  } else if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get number of CUDA devices: " +
            std::string(cudaGetErrorString(cuerr)));
  }

  // populates supported_gpus
  for (int gpu_id = 0; gpu_id < device_cnt; gpu_id++) {
    Status status = CheckGPUCompatibility(gpu_id, min_compute_capability);
    if (status.IsOk()) {
      supported_gpus->insert(gpu_id);
    }
  }
  return Status::Success;
}
Status
SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support)
{
  // Query the device to check if integrated
  cudaDeviceProp cuprops;
  cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
  if (cuerr != cudaSuccess) {
    return Status(
        Status::Code::INTERNAL,
        "unable to get CUDA device properties for GPU ID " +
            std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
  }

  // Zero-copy supported only on integrated GPU when it can map host memory
  if (cuprops.integrated && cuprops.canMapHostMemory) {
    *zero_copy_support = true;
  } else {
    *zero_copy_support = false;
  }

  return Status::Success;
}
#endif  // TRITON_ENABLE_GPU

}}  // namespace triton::core
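A minimal sketch of how these helpers might be combined during device discovery, assuming a GPU-enabled build, a hypothetical caller, and a minimum compute capability of 6.0; it is illustrative only, not code from this commit.

#ifdef TRITON_ENABLE_GPU
Status
ExampleDeviceDiscovery()
{
  std::set<int> gpus;
  Status status = GetSupportedGPUs(&gpus, 6.0 /* min compute capability */);
  if (!status.IsOk()) {
    return status;
  }

  // Peer access is best-effort; a failure here only means some device pairs
  // cannot exchange data directly.
  Status peer_status = EnablePeerAccess(6.0);
  if (!peer_status.IsOk()) {
    LOG_WARNING << peer_status.Message();
  }

  for (const int gpu : gpus) {
    bool zero_copy = false;
    status = SupportsIntegratedZeroCopy(gpu, &zero_copy);
    if (!status.IsOk()) {
      return status;
    }
    LOG_INFO << "GPU " << gpu
             << (zero_copy ? " supports" : " does not support")
             << " integrated zero-copy";
  }
  return Status::Success;
}
#endif  // TRITON_ENABLE_GPU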