"src/diffusers/pipelines/pipeline_glide.py" did not exist on "dc6324d44bc189a0bf63018145617a736e7a38ff"
Commit b30f3cdb authored by xiabo

Add the downloaded code

parent e38ee081
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
/// \file
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _COMPILING_TRITONSERVER
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONSERVER_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONSERVER_DECLSPEC
#endif
#else
#if defined(_MSC_VER)
#define TRITONSERVER_DECLSPEC __declspec(dllimport)
#else
#define TRITONSERVER_DECLSPEC
#endif
#endif
struct TRITONSERVER_BufferAttributes;
struct TRITONSERVER_Error;
struct TRITONSERVER_InferenceRequest;
struct TRITONSERVER_InferenceResponse;
struct TRITONSERVER_InferenceTrace;
struct TRITONSERVER_Message;
struct TRITONSERVER_Metrics;
struct TRITONSERVER_Parameter;
struct TRITONSERVER_ResponseAllocator;
struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
///
/// TRITONSERVER API Version
///
/// The TRITONSERVER API is versioned with major and minor version
/// numbers. Any change to the API that does not impact backwards
/// compatibility (for example, adding a non-required function)
/// increases the minor version number. Any change that breaks
/// backwards compatibility (for example, deleting or changing the
/// behavior of a function) increases the major version number. A
/// client should check that the API version used to compile the
/// client is compatible with the API version of the Triton shared
/// library that it is linking against. This is typically done by code
/// similar to the following which makes sure that the major versions
/// are equal and that the minor version of the Triton shared library
/// is >= the minor version used to build the client.
///
/// uint32_t api_version_major, api_version_minor;
/// TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
/// if ((api_version_major != TRITONSERVER_API_VERSION_MAJOR) ||
/// (api_version_minor < TRITONSERVER_API_VERSION_MINOR)) {
/// return TRITONSERVER_ErrorNew(
/// TRITONSERVER_ERROR_UNSUPPORTED,
/// "triton server API version does not support this client");
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 17
/// Get the TRITONSERVER API version supported by the Triton shared
/// library. This value can be compared against the
/// TRITONSERVER_API_VERSION_MAJOR and TRITONSERVER_API_VERSION_MINOR
/// used to build the client to ensure that the Triton shared library is
/// compatible with the client.
///
/// \param major Returns the TRITONSERVER API major version supported
/// by Triton.
/// \param minor Returns the TRITONSERVER API minor version supported
/// by Triton.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ApiVersion(
uint32_t* major, uint32_t* minor);
/// TRITONSERVER_DataType
///
/// Tensor data types recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_datatype_enum {
TRITONSERVER_TYPE_INVALID,
TRITONSERVER_TYPE_BOOL,
TRITONSERVER_TYPE_UINT8,
TRITONSERVER_TYPE_UINT16,
TRITONSERVER_TYPE_UINT32,
TRITONSERVER_TYPE_UINT64,
TRITONSERVER_TYPE_INT8,
TRITONSERVER_TYPE_INT16,
TRITONSERVER_TYPE_INT32,
TRITONSERVER_TYPE_INT64,
TRITONSERVER_TYPE_FP16,
TRITONSERVER_TYPE_FP32,
TRITONSERVER_TYPE_FP64,
TRITONSERVER_TYPE_BYTES,
TRITONSERVER_TYPE_BF16
} TRITONSERVER_DataType;
/// Get the string representation of a data type. The returned string
/// is not owned by the caller and so should not be modified or freed.
///
/// \param datatype The data type.
/// \return The string representation of the data type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_DataTypeString(
TRITONSERVER_DataType datatype);
/// Get the Triton datatype corresponding to a string representation
/// of a datatype.
///
/// \param dtype The datatype string representation.
/// \return The Triton data type or TRITONSERVER_TYPE_INVALID if the
/// string does not represent a data type.
TRITONSERVER_DECLSPEC TRITONSERVER_DataType
TRITONSERVER_StringToDataType(const char* dtype);
/// Get the size of a Triton datatype in bytes. Zero is returned for
/// TRITONSERVER_TYPE_BYTES because it has a variable size. Zero is
/// returned for TRITONSERVER_TYPE_INVALID.
///
/// \param datatype The datatype.
/// \return The size of the datatype.
TRITONSERVER_DECLSPEC uint32_t
TRITONSERVER_DataTypeByteSize(TRITONSERVER_DataType datatype);
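/// For example, a minimal sketch of converting between the string and enum
/// forms of a datatype and querying the element size ("FP32" is just one of
/// the valid datatype names):
///
///   TRITONSERVER_DataType dt = TRITONSERVER_StringToDataType("FP32");
///   if (dt != TRITONSERVER_TYPE_INVALID) {
///     const char* name = TRITONSERVER_DataTypeString(dt);     // "FP32"
///     uint32_t byte_size = TRITONSERVER_DataTypeByteSize(dt); // 4
///   }
///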
/// TRITONSERVER_MemoryType
///
/// Types of memory recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_memorytype_enum {
TRITONSERVER_MEMORY_CPU,
TRITONSERVER_MEMORY_CPU_PINNED,
TRITONSERVER_MEMORY_GPU
} TRITONSERVER_MemoryType;
/// Get the string representation of a memory type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param memtype The memory type.
/// \return The string representation of the memory type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_MemoryTypeString(
TRITONSERVER_MemoryType memtype);
/// TRITONSERVER_ParameterType
///
/// Types of parameters recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_parametertype_enum {
TRITONSERVER_PARAMETER_STRING,
TRITONSERVER_PARAMETER_INT,
TRITONSERVER_PARAMETER_BOOL,
TRITONSERVER_PARAMETER_BYTES
} TRITONSERVER_ParameterType;
/// Get the string representation of a parameter type. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param paramtype The parameter type.
/// \return The string representation of the parameter type.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ParameterTypeString(
TRITONSERVER_ParameterType paramtype);
/// Create a new parameter object. The caller takes ownership of the
/// TRITONSERVER_Parameter object and must call TRITONSERVER_ParameterDelete to
/// release the object. The object will maintain its own copy of the 'value'.
///
/// \param name The parameter name.
/// \param type The parameter type.
/// \param value The pointer to the value.
/// \return A new TRITONSERVER_Parameter object. 'nullptr' will be returned if
/// 'type' is 'TRITONSERVER_PARAMETER_BYTES'. The caller should use
/// TRITONSERVER_ParameterBytesNew to create parameter with bytes type.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterNew(
const char* name, const TRITONSERVER_ParameterType type, const void* value);
/// Create a new parameter object with type TRITONSERVER_PARAMETER_BYTES.
/// The caller takes ownership of the TRITONSERVER_Parameter object and must
/// call TRITONSERVER_ParameterDelete to release the object. The object only
/// maintains a shallow copy of the 'byte_ptr' so the data content must be
/// valid until the parameter object is deleted.
///
/// \param name The parameter name.
/// \param byte_ptr The pointer to the data content.
/// \param size The size of the data content.
/// \return A new TRITONSERVER_Parameter object.
TRITONSERVER_DECLSPEC TRITONSERVER_Parameter* TRITONSERVER_ParameterBytesNew(
const char* name, const void* byte_ptr, const uint64_t size);
/// Delete a parameter object.
///
/// \param parameter The parameter object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ParameterDelete(
TRITONSERVER_Parameter* parameter);
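/// For example, a hedged sketch that creates an INT parameter and releases
/// it (the parameter name "custom_key" is purely illustrative). The object
/// keeps its own copy of the pointed-to value, so the local variable does
/// not need to outlive the parameter:
///
///   int64_t value = 42;
///   TRITONSERVER_Parameter* param = TRITONSERVER_ParameterNew(
///       "custom_key", TRITONSERVER_PARAMETER_INT, &value);
///   if (param != nullptr) {
///     // ... pass 'param' to an API that accepts parameters ...
///     TRITONSERVER_ParameterDelete(param);
///   }
///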
/// TRITONSERVER_InstanceGroupKind
///
/// Kinds of instance groups recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_instancegroupkind_enum {
TRITONSERVER_INSTANCEGROUPKIND_AUTO,
TRITONSERVER_INSTANCEGROUPKIND_CPU,
TRITONSERVER_INSTANCEGROUPKIND_GPU,
TRITONSERVER_INSTANCEGROUPKIND_MODEL
} TRITONSERVER_InstanceGroupKind;
/// Get the string representation of an instance-group kind. The
/// returned string is not owned by the caller and so should not be
/// modified or freed.
///
/// \param kind The instance-group kind.
/// \return The string representation of the kind.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InstanceGroupKindString(
TRITONSERVER_InstanceGroupKind kind);
/// TRITONSERVER_Logging
///
/// Types/levels of logging.
///
typedef enum TRITONSERVER_loglevel_enum {
TRITONSERVER_LOG_INFO,
TRITONSERVER_LOG_WARN,
TRITONSERVER_LOG_ERROR,
TRITONSERVER_LOG_VERBOSE
} TRITONSERVER_LogLevel;
///
/// Format of logging.
///
/// TRITONSERVER_LOG_DEFAULT: the log severity (L) and timestamp will be
/// logged as "LMMDD hh:mm:ss.ssssss".
///
/// TRITONSERVER_LOG_ISO8601: the log format will be "YYYY-MM-DDThh:mm:ssZ L".
///
typedef enum TRITONSERVER_logformat_enum {
TRITONSERVER_LOG_DEFAULT,
TRITONSERVER_LOG_ISO8601
} TRITONSERVER_LogFormat;
/// Is a log level enabled?
///
/// \param level The log level.
/// \return True if the log level is enabled, false if not enabled.
TRITONSERVER_DECLSPEC bool TRITONSERVER_LogIsEnabled(
TRITONSERVER_LogLevel level);
/// Log a message at a given log level if that level is enabled.
///
/// \param level The log level.
/// \param filename The file name of the location of the log message.
/// \param line The line number of the log message.
/// \param msg The log message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_LogMessage(
TRITONSERVER_LogLevel level, const char* filename, const int line,
const char* msg);
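/// For example, a minimal sketch that logs a message only when INFO-level
/// logging is enabled (the message text is illustrative):
///
///   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_INFO)) {
///     TRITONSERVER_Error* err = TRITONSERVER_LogMessage(
///         TRITONSERVER_LOG_INFO, __FILE__, __LINE__, "model repository ready");
///     if (err != nullptr) {
///       TRITONSERVER_ErrorDelete(err);  // declared below
///     }
///   }
///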
/// TRITONSERVER_Error
///
/// Errors are reported by a TRITONSERVER_Error object. A NULL
/// TRITONSERVER_Error indicates no error, a non-NULL TRITONSERVER_Error
/// indicates error and the code and message for the error can be
/// retrieved from the object.
///
/// The caller takes ownership of a TRITONSERVER_Error object returned by
/// the API and must call TRITONSERVER_ErrorDelete to release the object.
///
/// The TRITONSERVER_Error error codes
typedef enum TRITONSERVER_errorcode_enum {
TRITONSERVER_ERROR_UNKNOWN,
TRITONSERVER_ERROR_INTERNAL,
TRITONSERVER_ERROR_NOT_FOUND,
TRITONSERVER_ERROR_INVALID_ARG,
TRITONSERVER_ERROR_UNAVAILABLE,
TRITONSERVER_ERROR_UNSUPPORTED,
TRITONSERVER_ERROR_ALREADY_EXISTS
} TRITONSERVER_Error_Code;
/// Create a new error object. The caller takes ownership of the
/// TRITONSERVER_Error object and must call TRITONSERVER_ErrorDelete to
/// release the object.
///
/// \param code The error code.
/// \param msg The error message.
/// \return A new TRITONSERVER_Error object.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ErrorNew(
TRITONSERVER_Error_Code code, const char* msg);
/// Delete an error object.
///
/// \param error The error object.
TRITONSERVER_DECLSPEC void TRITONSERVER_ErrorDelete(TRITONSERVER_Error* error);
/// Get the error code.
///
/// \param error The error object.
/// \return The error code.
TRITONSERVER_DECLSPEC TRITONSERVER_Error_Code
TRITONSERVER_ErrorCode(TRITONSERVER_Error* error);
/// Get the string representation of an error code. The returned
/// string is not owned by the caller and so should not be modified or
/// freed. The lifetime of the returned string extends only as long as
/// 'error' and must not be accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The string representation of the error code.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorCodeString(
TRITONSERVER_Error* error);
/// Get the error message. The returned string is not owned by the
/// caller and so should not be modified or freed. The lifetime of the
/// returned string extends only as long as 'error' and must not be
/// accessed once 'error' is deleted.
///
/// \param error The error object.
/// \return The error message.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_ErrorMessage(
TRITONSERVER_Error* error);
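/// For example, a typical error-handling sketch (SomeTritonCall stands in
/// for any API in this header that returns a TRITONSERVER_Error*):
///
///   TRITONSERVER_Error* err = SomeTritonCall(/* ... */);
///   if (err != nullptr) {
///     fprintf(
///         stderr, "%s: %s\n", TRITONSERVER_ErrorCodeString(err),
///         TRITONSERVER_ErrorMessage(err));
///     TRITONSERVER_ErrorDelete(err);
///   }
///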
/// TRITONSERVER_ResponseAllocator
///
/// Object representing a memory allocator for output tensors in an
/// inference response.
///
/// Type for allocation function that allocates a buffer to hold an
/// output tensor.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param byte_size The size of the buffer to allocate.
/// \param memory_type The type of memory that the caller prefers for
/// the buffer allocation.
/// \param memory_type_id The ID of the memory that the caller prefers
/// for the buffer allocation.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Returns a pointer to the allocated memory.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \param actual_memory_type Returns the type of memory where the
/// allocation resides. May be different than the type of memory
/// requested by 'memory_type'.
/// \param actual_memory_type_id Returns the ID of the memory where
/// the allocation resides. May be different than the ID of the memory
/// requested by 'memory_type_id'.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorAllocFn_t)(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, void* userp, void** buffer, void** buffer_userp,
TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id);
/// Type for allocation function that allocates a buffer to hold an
/// output tensor with buffer attributes. The callback function must fill in the
/// appropriate buffer attributes information related to this buffer. If set,
/// this function is always called after the
/// TRITONSERVER_ResponseAllocatorAllocFn_t function.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor to allocate for.
/// \param buffer_attributes The buffer attributes associated with the buffer.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer_userp Returns a user-specified value to associate
/// with the buffer, or nullptr if no user-specified value should be
/// associated with the buffer. This value will be provided in the
/// call to TRITONSERVER_ResponseAllocatorReleaseFn_t when the buffer
/// is released and will also be returned by
/// TRITONSERVER_InferenceResponseOutput.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting an allocation. If an error is returned all other return
/// values will be ignored.
typedef TRITONSERVER_Error* (
*TRITONSERVER_ResponseAllocatorBufferAttributesFn_t)(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
TRITONSERVER_BufferAttributes* buffer_attributes, void* userp,
void* buffer_userp);
/// Type for function that is called to query the allocator's preferred memory
/// type and memory type ID. As much as possible, the allocator should attempt
/// to return the same memory_type and memory_type_id values that will be
/// returned by the subsequent call to TRITONSERVER_ResponseAllocatorAllocFn_t.
/// But the allocator is not required to do so.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param tensor_name The name of the output tensor. This is optional
/// and it should be set to nullptr to indicate that the tensor name has
/// not been determined.
/// \param byte_size The expected size of the buffer. This is optional
/// and it should be set to nullptr to indicate that the byte size has
/// not been determined.
/// \param memory_type Acts as both input and output. On input gives
/// the memory type preferred by the caller. Returns the memory type
/// preferred by the allocator, taking into account the caller's preferred
/// type.
/// \param memory_type_id Acts as both input and output. On input gives
/// the memory type ID preferred by the caller. Returns the memory type ID
/// preferred by the allocator, taking into account the caller's preferred
/// type ID.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorQueryFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* userp,
const char* tensor_name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
/// Type for function that is called when the server no longer holds
/// any reference to a buffer allocated by
/// TRITONSERVER_ResponseAllocatorAllocFn_t. In practice this function
/// is typically called when the response object associated with the
/// buffer is deleted by TRITONSERVER_InferenceResponseDelete.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param buffer Pointer to the buffer to be freed.
/// \param buffer_userp The user-specified value associated
/// with the buffer in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \param byte_size The size of the buffer.
/// \param memory_type The type of memory holding the buffer.
/// \param memory_type_id The ID of the memory holding the buffer.
/// \return a TRITONSERVER_Error object if a failure occurs while
/// attempting the release. If an error is returned Triton will not
/// attempt to release the buffer again.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorReleaseFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
/// Type for function that is called to indicate that subsequent
/// allocation requests will refer to a new response.
///
/// \param allocator The allocator that is provided in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \param userp The user data pointer that is provided as
/// 'response_allocator_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
/// \return a TRITONSERVER_Error object if a failure occurs.
typedef TRITONSERVER_Error* (*TRITONSERVER_ResponseAllocatorStartFn_t)(
TRITONSERVER_ResponseAllocator* allocator, void* userp);
/// Create a new response allocator object.
///
/// The response allocator object is used by Triton to allocate
/// buffers to hold the output tensors in inference responses. Most
/// models generate a single response for each inference request
/// (TRITONSERVER_TXN_ONE_TO_ONE). For these models the order of
/// callbacks will be:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn : optional (and typically not required)
/// - alloc_fn : called once for each output tensor in response
/// TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in response
///
/// For models that generate multiple responses for each inference
/// request (TRITONSERVER_TXN_DECOUPLED), the start_fn callback can be
/// used to determine sets of alloc_fn callbacks that belong to the
/// same response:
///
/// TRITONSERVER_ServerInferAsync called
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// - start_fn
/// - alloc_fn : called once for each output tensor in response
/// ...
/// For each response, TRITONSERVER_InferenceResponseDelete called
/// - release_fn: called once for each output tensor in the response
///
/// In all cases the start_fn, alloc_fn and release_fn callback
/// functions must be thread-safe. Typically making these functions
/// thread-safe does not require explicit locking. The recommended way
/// to implement these functions is to have each inference request
/// provide a 'response_allocator_userp' object that is unique to that
/// request with TRITONSERVER_InferenceRequestSetResponseCallback. The
/// callback functions then operate only on this unique state. Locking
/// is required only when the callback function needs to access state
/// that is shared across inference requests (for example, a common
/// allocation pool).
///
/// \param allocator Returns the new response allocator object.
/// \param alloc_fn The function to call to allocate buffers for result
/// tensors.
/// \param release_fn The function to call when the server no longer
/// holds a reference to an allocated buffer.
/// \param start_fn The function to call to indicate that the
/// subsequent 'alloc_fn' calls are for a new response. This callback
/// is optional (use nullptr to indicate that it should not be
/// invoked).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorNew(
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
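/// For example, a minimal CPU-only sketch of the allocator callbacks and
/// allocator creation (the names CpuAlloc and CpuRelease are illustrative
/// and error handling is omitted):
///
///   TRITONSERVER_Error* CpuAlloc(
///       TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
///       size_t byte_size, TRITONSERVER_MemoryType memory_type,
///       int64_t memory_type_id, void* userp, void** buffer,
///       void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
///       int64_t* actual_memory_type_id)
///   {
///     *buffer = (byte_size == 0) ? nullptr : malloc(byte_size);
///     *buffer_userp = nullptr;
///     *actual_memory_type = TRITONSERVER_MEMORY_CPU;
///     *actual_memory_type_id = 0;
///     return nullptr;  // success
///   }
///
///   TRITONSERVER_Error* CpuRelease(
///       TRITONSERVER_ResponseAllocator* allocator, void* buffer,
///       void* buffer_userp, size_t byte_size,
///       TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
///   {
///     free(buffer);
///     return nullptr;  // success
///   }
///
///   TRITONSERVER_ResponseAllocator* allocator = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_ResponseAllocatorNew(
///       &allocator, CpuAlloc, CpuRelease, nullptr /* start_fn */);
///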
/// Set the buffer attributes function for a response allocator object.
/// The function will be called after alloc_fn to set the buffer attributes
/// associated with the output buffer.
///
/// The thread-safety requirement for buffer_attributes_fn is the same as
/// for the other allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param buffer_attributes_fn The function to call to get the buffer
/// attributes information for an allocated buffer.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetBufferAttributesFunction(
TRITONSERVER_ResponseAllocator* allocator,
TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn);
/// Set the query function for a response allocator object. The function
/// will usually be called before alloc_fn to determine the allocator's
/// preferred memory type and memory type ID in the current situation so
/// that different execution decisions can be made.
///
/// The thread-safety requirement for query_fn is the same as for the other
/// allocator callbacks.
///
/// \param allocator The response allocator object.
/// \param query_fn The function to call to query allocator's preferred memory
/// type and memory type ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ResponseAllocatorSetQueryFunction(
TRITONSERVER_ResponseAllocator* allocator,
TRITONSERVER_ResponseAllocatorQueryFn_t query_fn);
/// Delete a response allocator.
///
/// \param allocator The response allocator object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ResponseAllocatorDelete(
TRITONSERVER_ResponseAllocator* allocator);
/// TRITONSERVER_Message
///
/// Object representing a Triton Server message.
///
/// Create a new message object from serialized JSON string.
///
/// \param message The message object.
/// \param base The base of the serialized JSON.
/// \param byte_size The size, in bytes, of the serialized message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_MessageNewFromSerializedJson(
TRITONSERVER_Message** message, const char* base, size_t byte_size);
/// Delete a message object.
///
/// \param message The message object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageDelete(
TRITONSERVER_Message* message);
/// Get the base and size of the buffer containing the serialized
/// message in JSON format. The buffer is owned by the
/// TRITONSERVER_Message object and should not be modified or freed by
/// the caller. The lifetime of the buffer extends only as long as
/// 'message' and must not be accessed once 'message' is deleted.
///
/// \param message The message object.
/// \param base Returns the base of the serialized message.
/// \param byte_size Returns the size, in bytes, of the serialized
/// message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MessageSerializeToJson(
TRITONSERVER_Message* message, const char** base, size_t* byte_size);
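/// For example, a minimal sketch that round-trips a serialized JSON string
/// through a message object (the JSON content is illustrative):
///
///   const char* json = "{\"key\":\"value\"}";
///   TRITONSERVER_Message* message = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_MessageNewFromSerializedJson(
///       &message, json, strlen(json));
///   if (err == nullptr) {
///     const char* base;
///     size_t byte_size;
///     TRITONSERVER_MessageSerializeToJson(message, &base, &byte_size);
///     // 'base' is owned by 'message'; copy it before deleting the message.
///     TRITONSERVER_MessageDelete(message);
///   }
///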
/// TRITONSERVER_Metrics
///
/// Object representing metrics.
///
/// Metric format types
typedef enum tritonserver_metricformat_enum {
TRITONSERVER_METRIC_PROMETHEUS
} TRITONSERVER_MetricFormat;
/// Delete a metrics object.
///
/// \param metrics The metrics object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsDelete(
TRITONSERVER_Metrics* metrics);
/// Get a buffer containing the metrics in the specified format. For
/// each format the buffer contains the following:
///
/// TRITONSERVER_METRIC_PROMETHEUS: 'base' points to a single multiline
/// string (char*) that gives a text representation of the metrics in
/// prometheus format. 'byte_size' returns the length of the string
/// in bytes.
///
/// The buffer is owned by the 'metrics' object and should not be
/// modified or freed by the caller. The lifetime of the buffer
/// extends only as long as 'metrics' and must not be accessed once
/// 'metrics' is deleted.
///
/// \param metrics The metrics object.
/// \param format The format to use for the returned metrics.
/// \param base Returns a pointer to the base of the formatted
/// metrics, as described above.
/// \param byte_size Returns the size, in bytes, of the formatted
/// metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricsFormatted(
TRITONSERVER_Metrics* metrics, TRITONSERVER_MetricFormat format,
const char** base, size_t* byte_size);
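/// For example, a hedged sketch that assumes a TRITONSERVER_Metrics object
/// ('metrics') has already been obtained from a running server:
///
///   const char* base;
///   size_t byte_size;
///   TRITONSERVER_Error* err = TRITONSERVER_MetricsFormatted(
///       metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
///   if (err == nullptr) {
///     // Use the Prometheus text in 'base' before deleting 'metrics'.
///   }
///   TRITONSERVER_MetricsDelete(metrics);
///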
/// TRITONSERVER_InferenceTrace
///
/// Object that represents tracing for an inference request.
///
/// Trace levels. The trace level controls the type of trace
/// activities that are reported for an inference request.
///
/// Trace level values are power-of-2 and can be combined to trace
/// multiple types of activities. For example, use
/// (TRITONSERVER_TRACE_LEVEL_TIMESTAMPS |
/// TRITONSERVER_TRACE_LEVEL_TENSORS) to trace both timestamps and
/// tensors for an inference request.
///
/// TRITONSERVER_TRACE_LEVEL_MIN and TRITONSERVER_TRACE_LEVEL_MAX are
/// deprecated and should not be used.
typedef enum tritonserver_tracelevel_enum {
/// Tracing disabled. No trace activities are reported.
TRITONSERVER_TRACE_LEVEL_DISABLED = 0,
/// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
TRITONSERVER_TRACE_LEVEL_MIN = 1,
/// Deprecated. Use TRITONSERVER_TRACE_LEVEL_TIMESTAMPS.
TRITONSERVER_TRACE_LEVEL_MAX = 2,
/// Record timestamps for the inference request.
TRITONSERVER_TRACE_LEVEL_TIMESTAMPS = 0x4,
/// Record input and output tensor values for the inference request.
TRITONSERVER_TRACE_LEVEL_TENSORS = 0x8
} TRITONSERVER_InferenceTraceLevel;
/// Get the string representation of a trace level. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param level The trace level.
/// \return The string representation of the trace level.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceLevelString(
TRITONSERVER_InferenceTraceLevel level);
/// Trace activities
typedef enum tritonserver_traceactivity_enum {
TRITONSERVER_TRACE_REQUEST_START = 0,
TRITONSERVER_TRACE_QUEUE_START = 1,
TRITONSERVER_TRACE_COMPUTE_START = 2,
TRITONSERVER_TRACE_COMPUTE_INPUT_END = 3,
TRITONSERVER_TRACE_COMPUTE_OUTPUT_START = 4,
TRITONSERVER_TRACE_COMPUTE_END = 5,
TRITONSERVER_TRACE_REQUEST_END = 6,
TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT = 7,
TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT = 8,
TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT = 9
} TRITONSERVER_InferenceTraceActivity;
/// Get the string representation of a trace activity. The returned
/// string is not owned by the caller and so should not be modified or
/// freed.
///
/// \param activity The trace activity.
/// \return The string representation of the trace activity.
TRITONSERVER_DECLSPEC const char* TRITONSERVER_InferenceTraceActivityString(
TRITONSERVER_InferenceTraceActivity activity);
/// Type for trace timeline activity callback function. This callback function
/// is used to report activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceActivityFn_t)(
TRITONSERVER_InferenceTrace* trace,
TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
void* userp);
/// Type for trace tensor activity callback function. This callback function
/// is used to report tensor activity occurring for a trace. This function
/// does not take ownership of 'trace' and so any information needed
/// from that object must be copied before returning. The 'userp' data
/// is the same as what is supplied in the call to
/// TRITONSERVER_InferenceTraceTensorNew.
typedef void (*TRITONSERVER_InferenceTraceTensorActivityFn_t)(
TRITONSERVER_InferenceTrace* trace,
TRITONSERVER_InferenceTraceActivity activity, const char* name,
TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
const int64_t* shape, uint64_t dim_count,
TRITONSERVER_MemoryType memory_type, int64_t memory_type_id, void* userp);
/// Type for trace release callback function. This callback function
/// is called when all activity for the trace has completed. The
/// callback function takes ownership of the
/// TRITONSERVER_InferenceTrace object. The 'userp' data is the same
/// as what is supplied in the call to TRITONSERVER_InferenceTraceNew.
typedef void (*TRITONSERVER_InferenceTraceReleaseFn_t)(
TRITONSERVER_InferenceTrace* trace, void* userp);
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The activity callback function will be called to report activity
/// for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where activity for the
/// trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceNew(
TRITONSERVER_InferenceTrace** trace, TRITONSERVER_InferenceTraceLevel level,
uint64_t parent_id, TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
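/// For example, a minimal sketch of the trace callbacks and trace creation
/// (the names TraceActivity and TraceRelease are illustrative):
///
///   void TraceActivity(
///       TRITONSERVER_InferenceTrace* trace,
///       TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
///       void* userp)
///   {
///     // Record (trace id, activity, timestamp) in user-owned storage.
///   }
///
///   void TraceRelease(TRITONSERVER_InferenceTrace* trace, void* userp)
///   {
///     TRITONSERVER_InferenceTraceDelete(trace);  // callback owns 'trace'
///   }
///
///   TRITONSERVER_InferenceTrace* trace = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_InferenceTraceNew(
///       &trace, TRITONSERVER_TRACE_LEVEL_TIMESTAMPS, 0 /* parent_id */,
///       TraceActivity, TraceRelease, nullptr /* trace_userp */);
///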
/// Create a new inference trace object. The caller takes ownership of
/// the TRITONSERVER_InferenceTrace object and must call
/// TRITONSERVER_InferenceTraceDelete to release the object.
///
/// The timeline and tensor activity callback function will be called to report
/// activity for 'trace' as well as for any child traces that are spawned by
/// 'trace', and so the activity callback must check the trace object
/// to determine specifically what activity is being reported.
///
/// The release callback is called for both 'trace' and for any child
/// traces spawned by 'trace'.
///
/// \param trace Returns the new inference trace object.
/// \param level The tracing level.
/// \param parent_id The parent trace id for this trace. A value of 0
/// indicates that there is no parent trace.
/// \param activity_fn The callback function where timeline activity for the
/// trace is reported.
/// \param tensor_activity_fn The callback function where tensor activity for
/// the trace is reported.
/// \param release_fn The callback function called when all activity
/// is complete for the trace.
/// \param trace_userp User-provided pointer that is delivered to
/// the activity and release callback functions.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceTensorNew(
TRITONSERVER_InferenceTrace** trace, TRITONSERVER_InferenceTraceLevel level,
uint64_t parent_id, TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* trace_userp);
/// Delete a trace object.
///
/// \param trace The trace object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceDelete(
TRITONSERVER_InferenceTrace* trace);
/// Get the id associated with a trace. Every trace is assigned an id
/// that is unique across all traces created for a Triton server.
///
/// \param trace The trace.
/// \param id Returns the id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceId(
TRITONSERVER_InferenceTrace* trace, uint64_t* id);
/// Get the parent id associated with a trace. The parent id indicates
/// a parent-child relationship between two traces. A parent id value
/// of 0 indicates that there is no parent trace.
///
/// \param trace The trace.
/// \param id Returns the parent id associated with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceParentId(
TRITONSERVER_InferenceTrace* trace, uint64_t* parent_id);
/// Get the name of the model associated with a trace. The caller does
/// not own the returned string and must not modify or delete it. The
/// lifetime of the returned string extends only as long as 'trace'.
///
/// \param trace The trace.
/// \param model_name Returns the name of the model associated with
/// the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceTraceModelName(
TRITONSERVER_InferenceTrace* trace, const char** model_name);
/// Get the version of the model associated with a trace.
///
/// \param trace The trace.
/// \param model_version Returns the version of the model associated
/// with the trace.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceTraceModelVersion(
TRITONSERVER_InferenceTrace* trace, int64_t* model_version);
/// TRITONSERVER_InferenceRequest
///
/// Object representing an inference request. The inference request
/// provides the meta-data and input tensor values needed for an
/// inference and returns the inference result meta-data and output
/// tensors. An inference request object can be modified and reused
/// multiple times.
///
/// Inference request flags. The enum values must be power-of-2 values.
typedef enum tritonserver_requestflag_enum {
TRITONSERVER_REQUEST_FLAG_SEQUENCE_START = 1,
TRITONSERVER_REQUEST_FLAG_SEQUENCE_END = 2
} TRITONSERVER_RequestFlag;
/// Inference request release flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_requestreleaseflag_enum {
TRITONSERVER_REQUEST_RELEASE_ALL = 1
} TRITONSERVER_RequestReleaseFlag;
/// Inference response complete flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_responsecompleteflag_enum {
TRITONSERVER_RESPONSE_COMPLETE_FINAL = 1
} TRITONSERVER_ResponseCompleteFlag;
/// Type for inference request release callback function. The callback
/// indicates what type of release is being performed on the request
/// and for some of these the callback function takes ownership of the
/// TRITONSERVER_InferenceRequest object. The 'userp' data is the data
/// provided as 'request_release_userp' in the call to
/// TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// One or more flags will be specified when the callback is invoked,
/// and the callback must take the following actions:
///
/// - TRITONSERVER_REQUEST_RELEASE_ALL: The entire inference request
/// is being released and ownership is passed to the callback
/// function. Triton will no longer access the 'request' object
/// itself nor any input tensor data associated with the
/// request. The callback should free or otherwise manage the
/// 'request' object and all associated tensor data.
///
/// Note that currently TRITONSERVER_REQUEST_RELEASE_ALL should always
/// be set when the callback is invoked but in the future that may
/// change, so the callback should explicitly check for the flag
/// before taking ownership of the request object.
///
typedef void (*TRITONSERVER_InferenceRequestReleaseFn_t)(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);
/// Type for callback function indicating that an inference response
/// has completed. The callback function takes ownership of the
/// TRITONSERVER_InferenceResponse object. The 'userp' data is the
/// data provided as 'response_userp' in the call to
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// One or more flags may be specified when the callback is invoked:
///
/// - TRITONSERVER_RESPONSE_COMPLETE_FINAL: Indicates that no more
/// responses will be generated for a given request (more
/// specifically, that no more responses will be generated for the
/// inference request that set this callback and 'userp'). When
/// this flag is set 'response' may be a response object or may be
/// nullptr. If 'response' is not nullptr, then 'response' is the
/// last response that Triton will produce for the request. If
/// 'response' is nullptr then Triton is indicating that no more
/// responses will be produced for the request.
typedef void (*TRITONSERVER_InferenceResponseCompleteFn_t)(
TRITONSERVER_InferenceResponse* response, const uint32_t flags,
void* userp);
/// Create a new inference request object.
///
/// \param inference_request Returns the new request object.
/// \param server the inference server object.
/// \param model_name The name of the model to use for the request.
/// \param model_version The version of the model to use for the
/// request. If -1 then the server will choose a version based on the
/// model's policy.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestNew(
TRITONSERVER_InferenceRequest** inference_request,
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version);
/// Delete an inference request object.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestDelete(
TRITONSERVER_InferenceRequest* inference_request);
/// Get the ID for a request. The returned ID is owned by
/// 'inference_request' and must not be modified or freed by the
/// caller.
///
/// \param inference_request The request object.
/// \param id Returns the ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestId(
TRITONSERVER_InferenceRequest* inference_request, const char** id);
/// Set the ID for a request.
///
/// \param inference_request The request object.
/// \param id The ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetId(
TRITONSERVER_InferenceRequest* inference_request, const char* id);
/// Get the flag(s) associated with a request. On return 'flags' holds
/// a bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags Returns the flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestFlags(
TRITONSERVER_InferenceRequest* inference_request, uint32_t* flags);
/// Set the flag(s) associated with a request. 'flags' should hold a
/// bitwise-or of all flag values, see TRITONSERVER_RequestFlag for
/// available flags.
///
/// \param inference_request The request object.
/// \param flags The flags.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestSetFlags(
TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
/// Get the correlation ID of the inference request as an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is a string,
/// this function will return a failure. The correlation ID is used
/// to indicate that two or more inference requests are related to each
/// other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationId(
TRITONSERVER_InferenceRequest* inference_request, uint64_t* correlation_id);
/// Get the correlation ID of the inference request as a string.
/// Default is empty "", which indicates that the request has no correlation ID.
/// If the correlation id associated with the inference request is an unsigned
/// integer, then this function will return a failure. The correlation ID
/// is used to indicate that two or more inference requests are related to
/// each other.
/// How this relationship is handled by the inference server is determined by
/// the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id Returns the correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestCorrelationIdString(
TRITONSERVER_InferenceRequest* inference_request,
const char** correlation_id);
/// Set the correlation ID of the inference request to be an unsigned integer.
/// Default is 0, which indicates that the request has no correlation ID.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is handled
/// by the
/// inference server is determined by the model's scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationId(
TRITONSERVER_InferenceRequest* inference_request, uint64_t correlation_id);
/// Set the correlation ID of the inference request to be a string.
/// The correlation ID is used to indicate that two or more inference
/// requests are related to each other. How this relationship is
/// handled by the inference server is determined by the model's
/// scheduling policy.
///
/// \param inference_request The request object.
/// \param correlation_id The correlation ID.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetCorrelationIdString(
TRITONSERVER_InferenceRequest* inference_request,
const char* correlation_id);
/// Get the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority Returns the priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestPriority(
TRITONSERVER_InferenceRequest* inference_request, uint32_t* priority);
/// Set the priority for a request. The default is 0 indicating that
/// the request does not specify a priority and so will use the
/// model's default priority.
///
/// \param inference_request The request object.
/// \param priority The priority level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetPriority(
TRITONSERVER_InferenceRequest* inference_request, uint32_t priority);
/// Get the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us Returns the timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestTimeoutMicroseconds(
TRITONSERVER_InferenceRequest* inference_request, uint64_t* timeout_us);
/// Set the timeout for a request, in microseconds. The default is 0
/// which indicates that the request has no timeout.
///
/// \param inference_request The request object.
/// \param timeout_us The timeout, in microseconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetTimeoutMicroseconds(
TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
/// Add an input to a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param datatype The type of the input. Valid type names are BOOL,
/// UINT8, UINT16, UINT32, UINT64, INT8, INT16, INT32, INT64, FP16,
/// FP32, FP64, and BYTES.
/// \param shape The shape of the input.
/// \param dim_count The number of dimensions of 'shape'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceRequestAddInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const TRITONSERVER_DataType datatype, const int64_t* shape,
uint64_t dim_count);
/// Add a raw input to a request. The name recognized by the model, data type
/// and shape of the input will be deduced from model configuration.
/// This function must be called at most once on a request with no other
/// inputs to ensure the deduction is accurate.
///
/// \param inference_request The request object.
/// \param name The name of the input. This name is only used to reference
/// the raw input in other Tritonserver APIs. It is not associated with the
/// name used in the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRawInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an input from a request.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveInput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all inputs from a request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputs(
TRITONSERVER_InferenceRequest* inference_request);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputData(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
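/// For example, a hedged sketch that creates a request and attaches a CPU
/// input buffer (the model name "my_model" and input name "INPUT0" are
/// illustrative; 'server' is an existing TRITONSERVER_Server*):
///
///   TRITONSERVER_InferenceRequest* request = nullptr;
///   TRITONSERVER_Error* err = TRITONSERVER_InferenceRequestNew(
///       &request, server, "my_model", -1 /* latest version */);
///
///   const int64_t shape[2] = {1, 16};
///   static float input_data[16];  // must remain valid and unmodified until
///                                 // released by the request
///   err = TRITONSERVER_InferenceRequestAddInput(
///       request, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
///   err = TRITONSERVER_InferenceRequestAppendInputData(
///       request, "INPUT0", input_data, sizeof(input_data),
///       TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */);
///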
/// Assign a buffer of data to an input for execution on all model instances
/// with the specified host policy. The buffer will be appended to any existing
/// buffers for that input on all devices with this host policy. The
/// 'inference_request' object takes ownership of the buffer and so the caller
/// should not modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed from
/// 'inference_request'. If the execution is scheduled on a device that does
/// not have an input buffer specified using this function, then the input
/// buffer specified with TRITONSERVER_InferenceRequestAppendInputData will
/// be used, so a non-host-policy-specific version of the data must be added
/// using that API.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param byte_size The size, in bytes, of the input data.
/// \param memory_type The memory type of the input data.
/// \param memory_type_id The memory type id of the input data.
/// \param host_policy_name All model instances executing with this host_policy
/// will use this input buffer for execution.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, const char* host_policy_name);
/// Assign a buffer of data to an input. The buffer will be appended
/// to any existing buffers for that input. The 'inference_request'
/// object takes ownership of the buffer and so the caller should not
/// modify or free the buffer until that ownership is released by
/// 'inference_request' being deleted or by the input being removed
/// from 'inference_request'.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \param base The base address of the input data.
/// \param buffer_attributes The buffer attributes of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAppendInputDataWithBufferAttributes(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, TRITONSERVER_BufferAttributes* buffer_attributes);
/// Clear all input data from an input, releasing ownership of the
/// buffer(s) that were appended to the input with
/// TRITONSERVER_InferenceRequestAppendInputData or
/// TRITONSERVER_InferenceRequestAppendInputDataWithHostPolicy.
///
/// \param inference_request The request object.
/// \param name The name of the input.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllInputData(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Add an output request to an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestAddRequestedOutput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove an output request from an inference request.
///
/// \param inference_request The request object.
/// \param name The name of the output.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveRequestedOutput(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
/// Remove all output requests from an inference request.
///
/// \param inference_request The request object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestRemoveAllRequestedOutputs(
TRITONSERVER_InferenceRequest* inference_request);
/// Set the release callback for an inference request. The release
/// callback is called by Triton to return ownership of the request
/// object.
///
/// \param inference_request The request object.
/// \param request_release_fn The function called to return ownership
/// of the 'inference_request' object.
/// \param request_release_userp User-provided pointer that is
/// delivered to the 'request_release_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetReleaseCallback(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
void* request_release_userp);
/// Set the allocator and response callback for an inference
/// request. The allocator is used to allocate buffers for any output
/// tensors included in responses that are produced for this
/// request. The response callback is called to return response
/// objects representing responses produced for this request.
///
/// \param inference_request The request object.
/// \param response_allocator The TRITONSERVER_ResponseAllocator to use
/// to allocate buffers to hold inference results.
/// \param response_allocator_userp User-provided pointer that is
/// delivered to the response allocator's start and allocation functions.
/// \param response_fn The function called to deliver an inference
/// response for this request.
/// \param response_userp User-provided pointer that is delivered to
/// the 'response_fn' callback.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceRequestSetResponseCallback(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_ResponseAllocator* response_allocator,
void* response_allocator_userp,
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp);
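/// For example, a hedged sketch of the release and response-complete
/// callbacks and how they are attached to a request (the names
/// RequestRelease and ResponseComplete are illustrative; 'request' and
/// 'allocator' are objects created as shown in the earlier sketches):
///
///   void RequestRelease(
///       TRITONSERVER_InferenceRequest* request, const uint32_t flags,
///       void* userp)
///   {
///     if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
///       TRITONSERVER_InferenceRequestDelete(request);
///     }
///   }
///
///   void ResponseComplete(
///       TRITONSERVER_InferenceResponse* response, const uint32_t flags,
///       void* userp)
///   {
///     if (response != nullptr) {
///       // Read outputs here, then release the response.
///       TRITONSERVER_InferenceResponseDelete(response);
///     }
///   }
///
///   err = TRITONSERVER_InferenceRequestSetReleaseCallback(
///       request, RequestRelease, nullptr /* request_release_userp */);
///   err = TRITONSERVER_InferenceRequestSetResponseCallback(
///       request, allocator, nullptr /* response_allocator_userp */,
///       ResponseComplete, nullptr /* response_userp */);
///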
/// TRITONSERVER_InferenceResponse
///
/// Object representing an inference response. The inference response
/// provides the meta-data and output tensor values calculated by the
/// inference.
///
/// Delete an inference response object.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseDelete(
TRITONSERVER_InferenceResponse* inference_response);
/// Return the error status of an inference response. Return a
/// TRITONSERVER_Error object on failure, return nullptr on success.
/// The returned error object is owned by 'inference_response' and so
/// should not be deleted by the caller.
///
/// \param inference_response The response object.
/// \return a TRITONSERVER_Error indicating the success or failure
/// status of the response.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseError(
TRITONSERVER_InferenceResponse* inference_response);
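/// As a usage sketch, a typical check inside a response-complete callback:
///
///   TRITONSERVER_Error* err = TRITONSERVER_InferenceResponseError(response);
///   if (err != nullptr) {
///     // The inference failed. 'err' is owned by 'response' and must not be
///     // deleted here; TRITONSERVER_ErrorMessage (declared earlier in this
///     // header) can be used to obtain a printable message.
///   }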
/// Get model used to produce a response. The caller does not own the
/// returned model name value and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param model_name Returns the name of the model.
/// \param model_version Returns the version of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseModel(
TRITONSERVER_InferenceResponse* inference_response, const char** model_name,
int64_t* model_version);
/// Get the ID of the request corresponding to a response. The caller
/// does not own the returned ID and must not modify or delete it. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// \param inference_response The response object.
/// \param request_id Returns the ID of the request corresponding to
/// this response.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseId(
TRITONSERVER_InferenceResponse* inference_response,
const char** request_id);
/// Get the number of parameters available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameterCount(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about a parameter. The caller does not own any
/// of the returned values and must not modify or delete them. The
/// lifetime of all returned values extends until 'inference_response'
/// is deleted.
///
/// The 'vvalue' returns a pointer that must be cast
/// appropriately based on 'type'. For example:
///
///   const void* vvalue;
///   TRITONSERVER_ParameterType type;
///   TRITONSERVER_InferenceResponseParameter(
///       response, index, &name, &type, &vvalue);
///   switch (type) {
///     case TRITONSERVER_PARAMETER_BOOL:
///       bool value = *(reinterpret_cast<const bool*>(vvalue));
///       ...
///     case TRITONSERVER_PARAMETER_INT:
///       int64_t value = *(reinterpret_cast<const int64_t*>(vvalue));
///       ...
///     case TRITONSERVER_PARAMETER_STRING:
///       const char* value = reinterpret_cast<const char*>(vvalue);
///       ...
///
/// \param inference_response The response object.
/// \param index The index of the parameter, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseParameterCount.
/// \param name Returns the name of the parameter.
/// \param type Returns the type of the parameter.
/// \param vvalue Returns a pointer to the parameter value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseParameter(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_ParameterType* type, const void** vvalue);
/// Get the number of outputs available in the response.
///
/// \param inference_response The response object.
/// \param count Returns the number of output tensors.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputCount(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
/// Get all information about an output tensor. The tensor data is
/// returned as the base pointer to the data and the size, in bytes,
/// of the data. The caller does not own any of the returned values
/// and must not modify or delete them. The lifetime of all returned
/// values extends until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param name Returns the name of the output.
/// \param datatype Returns the type of the output.
/// \param shape Returns the shape of the output.
/// \param dim_count Returns the number of dimensions of the returned
/// shape.
/// \param base Returns the tensor data for the output.
/// \param byte_size Returns the size, in bytes, of the data.
/// \param memory_type Returns the memory type of the data.
/// \param memory_type_id Returns the memory type id of the data.
/// \param userp The user-specified value associated with the buffer
/// in TRITONSERVER_ResponseAllocatorAllocFn_t.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_InferenceResponseOutput(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
uint64_t* dim_count, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
void** userp);
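/// As a usage sketch, iterating the outputs of a response with the two
/// functions above ('response' is a placeholder; error handling omitted):
///
///   uint32_t output_count = 0;
///   TRITONSERVER_InferenceResponseOutputCount(response, &output_count);
///   for (uint32_t idx = 0; idx < output_count; ++idx) {
///     const char* name = nullptr;
///     TRITONSERVER_DataType datatype;
///     const int64_t* shape = nullptr;
///     uint64_t dim_count = 0;
///     const void* base = nullptr;
///     size_t byte_size = 0;
///     TRITONSERVER_MemoryType memory_type;
///     int64_t memory_type_id = 0;
///     void* userp = nullptr;
///     TRITONSERVER_InferenceResponseOutput(
///         response, idx, &name, &datatype, &shape, &dim_count, &base,
///         &byte_size, &memory_type, &memory_type_id, &userp);
///     // 'base' and 'byte_size' stay valid until the response is deleted.
///   }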
/// Get a classification label associated with an output for a given
/// index. The caller does not own the returned label and must not
/// modify or delete it. The lifetime of the returned label extends
/// until 'inference_response' is deleted.
///
/// \param inference_response The response object.
/// \param index The index of the output tensor, must be 0 <= index <
/// count, where 'count' is the value returned by
/// TRITONSERVER_InferenceResponseOutputCount.
/// \param class_index The index of the class.
/// \param label Returns the label corresponding to 'class_index' or
/// nullptr if there is no label.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_InferenceResponseOutputClassificationLabel(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const size_t class_index, const char** label);
/// TRITONSERVER_BufferAttributes
///
/// API to create, modify, or retrieve attributes associated with a buffer.
///
/// Create a new buffer attributes object. The caller takes ownership of
/// the TRITONSERVER_BufferAttributes object and must call
/// TRITONSERVER_BufferAttributesDelete to release the object.
///
/// \param buffer_attributes Returns the new buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesNew(
TRITONSERVER_BufferAttributes** buffer_attributes);
/// Delete a buffer attributes object.
///
/// \param buffer_attributes The buffer_attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesDelete(
TRITONSERVER_BufferAttributes* buffer_attributes);
/// Set the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Memory type id to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryTypeId(
TRITONSERVER_BufferAttributes* buffer_attributes, int64_t memory_type_id);
/// Set the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Memory type to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetMemoryType(
TRITONSERVER_BufferAttributes* buffer_attributes,
TRITONSERVER_MemoryType memory_type);
/// Set the CudaIpcHandle field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle The CudaIpcHandle to assign to the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetCudaIpcHandle(
TRITONSERVER_BufferAttributes* buffer_attributes, void* cuda_ipc_handle);
/// Set the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Byte size to assign to the buffer attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesSetByteSize(
TRITONSERVER_BufferAttributes* buffer_attributes, size_t byte_size);
/// Get the memory type id field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type_id Returns the memory type id associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryTypeId(
TRITONSERVER_BufferAttributes* buffer_attributes, int64_t* memory_type_id);
/// Get the memory type field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param memory_type Returns the memory type associated with the buffer
/// attributes object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesMemoryType(
TRITONSERVER_BufferAttributes* buffer_attributes,
TRITONSERVER_MemoryType* memory_type);
/// Get the CudaIpcHandle field of the buffer attributes object.
///
/// \param buffer_attributes The buffer attributes object.
/// \param cuda_ipc_handle Returns the CUDA IPC handle associated with the
/// buffer attributes object. If no cudaIpcHandle has been set for the buffer,
/// nullptr will be returned.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_BufferAttributesCudaIpcHandle(
TRITONSERVER_BufferAttributes* buffer_attributes, void** cuda_ipc_handle);
/// Get the byte size field of the buffer attributes.
///
/// \param buffer_attributes The buffer attributes object.
/// \param byte_size Returns the byte size associated with the buffer attributes
/// object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_BufferAttributesByteSize(
TRITONSERVER_BufferAttributes* buffer_attributes, size_t* byte_size);
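/// As a usage sketch, creating, populating, and querying a buffer attributes
/// object with the functions above, assuming the TRITONSERVER_MEMORY_CPU
/// enumerator declared earlier in this header (values are illustrative;
/// error handling omitted):
///
///   TRITONSERVER_BufferAttributes* attrs = nullptr;
///   TRITONSERVER_BufferAttributesNew(&attrs);
///   TRITONSERVER_BufferAttributesSetMemoryType(attrs, TRITONSERVER_MEMORY_CPU);
///   TRITONSERVER_BufferAttributesSetMemoryTypeId(attrs, 0);
///   TRITONSERVER_BufferAttributesSetByteSize(attrs, 4096);
///
///   size_t byte_size = 0;
///   TRITONSERVER_BufferAttributesByteSize(attrs, &byte_size);
///   TRITONSERVER_BufferAttributesDelete(attrs);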
/// TRITONSERVER_ServerOptions
///
/// Options to use when creating an inference server.
///
/// Model control modes
typedef enum tritonserver_modelcontrolmode_enum {
TRITONSERVER_MODEL_CONTROL_NONE,
TRITONSERVER_MODEL_CONTROL_POLL,
TRITONSERVER_MODEL_CONTROL_EXPLICIT
} TRITONSERVER_ModelControlMode;
/// Rate limit modes
typedef enum tritonserver_ratelimitmode_enum {
TRITONSERVER_RATE_LIMIT_OFF,
TRITONSERVER_RATE_LIMIT_EXEC_COUNT
} TRITONSERVER_RateLimitMode;
/// Create a new server options object. The caller takes ownership of
/// the TRITONSERVER_ServerOptions object and must call
/// TRITONSERVER_ServerOptionsDelete to release the object.
///
/// \param options Returns the new server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsNew(
TRITONSERVER_ServerOptions** options);
/// Delete a server options object.
///
/// \param options The server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsDelete(
TRITONSERVER_ServerOptions* options);
/// Set the textual ID for the server in a server options. The ID is a
/// name that identifies the server.
///
/// \param options The server options object.
/// \param server_id The server identifier.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetServerId(
TRITONSERVER_ServerOptions* options, const char* server_id);
/// Set the model repository path in a server options. The path must be
/// the full absolute path to the model repository. This function can be called
/// multiple times with different paths to set multiple model repositories.
/// Note that if a model is not unique across all model repositories
/// at any time, the model will not be available.
///
/// \param options The server options object.
/// \param model_repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelRepositoryPath(
TRITONSERVER_ServerOptions* options, const char* model_repository_path);
/// Set the model control mode in a server options. For each mode the models
/// will be managed as the following:
///
/// TRITONSERVER_MODEL_CONTROL_NONE: the models in the model repository will
/// be loaded on startup. After startup any changes to the model repository
/// will be ignored. Calling TRITONSERVER_ServerPollModelRepository will
/// result in an error.
///
/// TRITONSERVER_MODEL_CONTROL_POLL: the models in the model repository will
/// be loaded on startup. The model repository can be polled periodically
/// using TRITONSERVER_ServerPollModelRepository and the server will load,
/// unload, and update models according to changes in the model repository.
///
/// TRITONSERVER_MODEL_CONTROL_EXPLICIT: the models in the model repository
/// will not be loaded on startup. The corresponding model control APIs must
/// be called to load/unload a model in the model repository.
///
/// \param options The server options object.
/// \param mode The mode to use for the model control.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelControlMode(
TRITONSERVER_ServerOptions* options, TRITONSERVER_ModelControlMode mode);
/// Set the model to be loaded at startup in a server options. The model must be
/// present in one, and only one, of the specified model repositories.
/// This function can be called multiple times with different model names
/// to set multiple startup models.
/// Note that it only takes effect in TRITONSERVER_MODEL_CONTROL_EXPLICIT mode.
///
/// \param options The server options object.
/// \param model_name The name of the model to load on startup.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStartupModel(
TRITONSERVER_ServerOptions* options, const char* model_name);
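/// As a usage sketch, configuring explicit model control with a single
/// startup model (the path and model name are placeholders):
///
///   TRITONSERVER_ServerOptions* options = nullptr;
///   TRITONSERVER_ServerOptionsNew(&options);
///   TRITONSERVER_ServerOptionsSetModelRepositoryPath(options, "/models");
///   TRITONSERVER_ServerOptionsSetModelControlMode(
///       options, TRITONSERVER_MODEL_CONTROL_EXPLICIT);
///   TRITONSERVER_ServerOptionsSetStartupModel(options, "my_model");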
/// Enable or disable strict model configuration handling in a server
/// options.
///
/// \param options The server options object.
/// \param strict True to enable strict model configuration handling,
/// false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictModelConfig(
TRITONSERVER_ServerOptions* options, bool strict);
/// Set the rate limit mode in a server options.
///
/// TRITONSERVER_RATE_LIMIT_EXEC_COUNT: The rate limiter prioritizes
/// inference execution based on the number of times each instance has been
/// given the chance to run. An execution runs only when its resource
/// constraints are satisfied.
///
/// TRITONSERVER_RATE_LIMIT_OFF: Rate limiting is turned off and an
/// inference is executed whenever a model instance is available.
///
/// \param options The server options object.
/// \param mode The mode to use for the rate limiting. By default, execution
/// count is used to determine the priorities.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRateLimiterMode(
TRITONSERVER_ServerOptions* options, TRITONSERVER_RateLimitMode mode);
/// Add resource count for rate limiting.
///
/// \param options The server options object.
/// \param resource_name The name of the resource.
/// \param resource_count The count of the resource.
/// \param device The device identifier for the resource. A value of -1
/// indicates that the specified number of resources are available on every
/// device. The device value is ignored for a global resource. The server
/// will use the rate limiter configuration specified for instance groups
/// in the model config to determine whether a resource is global. In case of
/// conflicting resource types in different model configurations, the server
/// will raise an appropriate error while loading the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsAddRateLimiterResource(
TRITONSERVER_ServerOptions* options, const char* resource_name,
const size_t resource_count, const int device);
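/// As a usage sketch, enabling execution-count rate limiting and registering
/// a resource that is available on every device (the resource name and count
/// are illustrative):
///
///   TRITONSERVER_ServerOptionsSetRateLimiterMode(
///       options, TRITONSERVER_RATE_LIMIT_EXEC_COUNT);
///   TRITONSERVER_ServerOptionsAddRateLimiterResource(
///       options, "R1", 4 /* count */, -1 /* available on every device */);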
/// Set the total pinned memory byte size that the server can allocate
/// in a server options. The pinned memory pool will be shared across
/// Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param size The pinned memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
TRITONSERVER_ServerOptions* options, uint64_t size);
/// Set the total CUDA memory byte size that the server can allocate
/// on a given GPU device in a server options. The CUDA memory pool
/// will be shared across Triton itself and the backends that use
/// TRITONBACKEND_MemoryManager to allocate memory.
///
/// \param options The server options object.
/// \param gpu_device The GPU device to allocate the memory pool.
/// \param size The CUDA memory pool byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
/// Set the total response cache byte size that the server can allocate in CPU
/// memory. The response cache will be shared across all inference requests and
/// across all models.
///
/// \param options The server options object.
/// \param size The total response cache byte size.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
TRITONSERVER_ServerOptions* options, uint64_t size);
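/// As a usage sketch, sizing the memory pools and response cache (the sizes
/// are illustrative, not recommendations):
///
///   TRITONSERVER_ServerOptionsSetPinnedMemoryPoolByteSize(
///       options, 256 * 1024 * 1024);                     // 256 MiB pinned
///   TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize(
///       options, 0 /* gpu_device */, 64 * 1024 * 1024);  // 64 MiB on GPU 0
///   TRITONSERVER_ServerOptionsSetResponseCacheByteSize(
///       options, 64 * 1024 * 1024);                      // 64 MiB cache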
/// Set the minimum supported CUDA compute capability in a server
/// options.
///
/// \param options The server options object.
/// \param cc The minimum CUDA compute capability.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability(
TRITONSERVER_ServerOptions* options, double cc);
/// Enable or disable exit-on-error in a server options.
///
/// \param options The server options object.
/// \param exit True to enable exiting on initialization error, false
/// to continue.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitOnError(
TRITONSERVER_ServerOptions* options, bool exit);
/// Enable or disable strict readiness handling in a server options.
///
/// \param options The server options object.
/// \param strict True to enable strict readiness handling, false to
/// disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetStrictReadiness(
TRITONSERVER_ServerOptions* options, bool strict);
/// Set the exit timeout, in seconds, for the server in a server
/// options.
///
/// \param options The server options object.
/// \param timeout The exit timeout, in seconds.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetExitTimeout(
TRITONSERVER_ServerOptions* options, unsigned int timeout);
/// Set the number of threads used in buffer manager in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBufferManagerThreadCount(
TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Set the number of threads to concurrently load models in a server options.
///
/// \param options The server options object.
/// \param thread_count The number of threads.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadThreadCount(
TRITONSERVER_ServerOptions* options, unsigned int thread_count);
/// Provide a log output file.
///
/// \param options The server options object.
/// \param file A string naming the file where the log output will be saved.
/// An empty string for the file name will cause Triton to direct logging
/// output to the console.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogFile(
TRITONSERVER_ServerOptions* options, const char* file);
/// Enable or disable info level logging.
///
/// \param options The server options object.
/// \param log True to enable info logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogInfo(
TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable warning level logging.
///
/// \param options The server options object.
/// \param log True to enable warning logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogWarn(
TRITONSERVER_ServerOptions* options, bool log);
/// Enable or disable error level logging.
///
/// \param options The server options object.
/// \param log True to enable error logging, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetLogError(
TRITONSERVER_ServerOptions* options, bool log);
/// Set the logging format.
///
/// \param options The server options object.
/// \param format The logging format.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogFormat(
TRITONSERVER_ServerOptions* options, const TRITONSERVER_LogFormat format);
/// Set verbose logging level. Level zero disables verbose logging.
///
/// \param options The server options object.
/// \param level The verbose logging level.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetLogVerbose(
TRITONSERVER_ServerOptions* options, int level);
/// Enable or disable metrics collection in a server options.
///
/// \param options The server options object.
/// \param metrics True to enable metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetMetrics(
TRITONSERVER_ServerOptions* options, bool metrics);
/// Enable or disable GPU metrics collection in a server options. GPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param gpu_metrics True to enable GPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetGpuMetrics(
TRITONSERVER_ServerOptions* options, bool gpu_metrics);
/// Enable or disable CPU metrics collection in a server options. CPU
/// metrics are collected if both this option and
/// TRITONSERVER_ServerOptionsSetMetrics are true.
///
/// \param options The server options object.
/// \param cpu_metrics True to enable CPU metrics, false to disable.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetCpuMetrics(
TRITONSERVER_ServerOptions* options, bool cpu_metrics);
/// Set the interval for metrics collection in a server options.
/// This is 2000 milliseconds by default.
///
/// \param options The server options object.
/// \param metrics_interval_ms The time interval in ms between
/// successive metrics updates.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetMetricsInterval(
TRITONSERVER_ServerOptions* options, uint64_t metrics_interval_ms);
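/// As a usage sketch, directing logs to a file and enabling metrics with a
/// one second collection interval (the file path is a placeholder):
///
///   TRITONSERVER_ServerOptionsSetLogFile(options, "/tmp/triton.log");
///   TRITONSERVER_ServerOptionsSetLogInfo(options, true);
///   TRITONSERVER_ServerOptionsSetLogVerbose(options, 1);
///   TRITONSERVER_ServerOptionsSetMetrics(options, true);
///   TRITONSERVER_ServerOptionsSetGpuMetrics(options, true);
///   TRITONSERVER_ServerOptionsSetMetricsInterval(options, 1000 /* ms */);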
/// Set the directory containing backend shared libraries. This
/// directory is searched last after the version and model directory
/// in the model repository when looking for the backend shared
/// library for a model. If the backend is named 'be' the directory
/// searched is 'backend_dir'/be/libtriton_be.so.
///
/// \param options The server options object.
/// \param backend_dir The full path of the backend directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendDirectory(
TRITONSERVER_ServerOptions* options, const char* backend_dir);
/// Set the directory containing repository agent shared libraries. This
/// directory is searched when looking for the repository agent shared
/// library for a model. If the repository agent is named 'ra' the directory
/// searched is 'repoagent_dir'/ra/libtritonrepoagent_ra.so.
///
/// \param options The server options object.
/// \param repoagent_dir The full path of the repository agent directory.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetRepoAgentDirectory(
TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
/// Specify the limit on memory usage as a fraction on the device identified by
/// 'kind' and 'device_id'. If model loading on the device is requested and the
/// current memory usage exceeds the limit, the load will be rejected. If not
/// specified, the limit will not be set.
///
/// Currently only TRITONSERVER_INSTANCEGROUPKIND_GPU is supported.
///
/// \param options The server options object.
/// \param kind The kind of the device.
/// \param device_id The id of the device.
/// \param fraction The limit on memory usage as a fraction.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetModelLoadDeviceLimit(
TRITONSERVER_ServerOptions* options,
const TRITONSERVER_InstanceGroupKind kind, const int device_id,
const double fraction);
/// Set a configuration setting for a named backend in a server
/// options.
///
/// \param options The server options object.
/// \param backend_name The name of the backend.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetBackendConfig(
TRITONSERVER_ServerOptions* options, const char* backend_name,
const char* setting, const char* value);
/// Set a host policy setting for a given policy name in a server options.
///
/// \param options The server options object.
/// \param policy_name The name of the policy.
/// \param setting The name of the setting.
/// \param value The setting value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerOptionsSetHostPolicy(
TRITONSERVER_ServerOptions* options, const char* policy_name,
const char* setting, const char* value);
/// TRITONSERVER_Server
///
/// An inference server.
///
/// Model batch flags. The enum values must be power-of-2 values.
typedef enum tritonserver_batchflag_enum {
TRITONSERVER_BATCH_UNKNOWN = 1,
TRITONSERVER_BATCH_FIRST_DIM = 2
} TRITONSERVER_ModelBatchFlag;
/// Model index flags. The enum values must be power-of-2 values.
typedef enum tritonserver_modelindexflag_enum {
TRITONSERVER_INDEX_FLAG_READY = 1
} TRITONSERVER_ModelIndexFlag;
/// Model transaction policy flags. The enum values must be
/// power-of-2 values.
typedef enum tritonserver_txn_property_flag_enum {
TRITONSERVER_TXN_ONE_TO_ONE = 1,
TRITONSERVER_TXN_DECOUPLED = 2
} TRITONSERVER_ModelTxnPropertyFlag;
/// Create a new server object. The caller takes ownership of the
/// TRITONSERVER_Server object and must call TRITONSERVER_ServerDelete
/// to release the object.
///
/// \param server Returns the new inference server object.
/// \param options The inference server options object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerNew(
TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* options);
/// Delete a server object. If server is not already stopped it is
/// stopped before being deleted.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerDelete(
TRITONSERVER_Server* server);
/// Stop a server object. A server can't be restarted once it is
/// stopped.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerStop(
TRITONSERVER_Server* server);
/// Register a new model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \param name_mapping List of name_mapping parameters. Each mapping has
/// the model directory name as its key and the overridden model name as its
/// value.
/// \param mapping_count Number of mappings provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerRegisterModelRepository(
TRITONSERVER_Server* server, const char* repository_path,
const TRITONSERVER_Parameter** name_mapping, const uint32_t mapping_count);
/// Unregister a model repository. Not available in polling mode.
///
/// \param server The inference server object.
/// \param repository_path The full path to the model repository.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnregisterModelRepository(
TRITONSERVER_Server* server, const char* repository_path);
/// Check the model repository for changes and update server state
/// based on those changes.
///
/// \param server The inference server object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerPollModelRepository(TRITONSERVER_Server* server);
/// Is the server live?
///
/// \param server The inference server object.
/// \param live Returns true if server is live, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerIsLive(
TRITONSERVER_Server* server, bool* live);
/// Is the server ready?
///
/// \param server The inference server object.
/// \param ready Returns true if server is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerIsReady(
TRITONSERVER_Server* server, bool* ready);
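/// Putting the options and server functions together, a minimal lifecycle
/// sketch (error handling omitted; 'options' is a previously configured
/// server options object):
///
///   TRITONSERVER_Server* server = nullptr;
///   TRITONSERVER_ServerNew(&server, options);
///   // The options object is typically no longer needed once the server
///   // has been created.
///   TRITONSERVER_ServerOptionsDelete(options);
///
///   bool live = false, ready = false;
///   TRITONSERVER_ServerIsLive(server, &live);
///   TRITONSERVER_ServerIsReady(server, &ready);
///
///   // ... submit inference requests ...
///
///   TRITONSERVER_ServerStop(server);
///   TRITONSERVER_ServerDelete(server);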
/// Is the model ready?
///
/// \param server The inference server object.
/// \param model_name The name of the model to get readiness for.
/// \param model_version The version of the model to get readiness
/// for. If -1 then the server will choose a version based on the
/// model's policy.
/// \param ready Returns true if the model is ready, false otherwise.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelIsReady(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, bool* ready);
/// Get the batch properties of the model. The properties are
/// communicated by a flags value and an (optional) object returned by
/// 'voidp'.
///
/// - TRITONSERVER_BATCH_UNKNOWN: Triton cannot determine the
/// batching properties of the model. This means that the model
/// does not support batching in any way that is usable by
/// Triton. The returned 'voidp' value is nullptr.
///
/// - TRITONSERVER_BATCH_FIRST_DIM: The model supports batching
/// along the first dimension of every input and output
/// tensor. Triton schedulers that perform batching can
/// automatically batch inference requests along this dimension.
/// The returned 'voidp' value is nullptr.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param flags Returns flags indicating the batch properties of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the
/// 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelBatchProperties(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, uint32_t* flags, void** voidp);
/// Get the transaction policy of the model. The policy is
/// communicated by a flags value.
///
/// - TRITONSERVER_TXN_ONE_TO_ONE: The model generates exactly
/// one response per request.
///
/// - TRITONSERVER_TXN_DECOUPLED: The model may generate zero
/// to many responses per request.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param txn_flags Returns flags indicating the transaction policy of the
/// model.
/// \param voidp If non-nullptr, returns a pointer specific to the 'flags' value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerModelTransactionProperties(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, uint32_t* txn_flags, void** voidp);
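/// As a usage sketch, querying both property sets and testing the returned
/// flag bits ('server' and the model name are placeholders):
///
///   uint32_t batch_flags = 0, txn_flags = 0;
///   void* voidp = nullptr;
///   TRITONSERVER_ServerModelBatchProperties(
///       server, "my_model", -1 /* version per policy */, &batch_flags,
///       &voidp);
///   TRITONSERVER_ServerModelTransactionProperties(
///       server, "my_model", -1, &txn_flags, &voidp);
///   const bool batches_first_dim =
///       (batch_flags & TRITONSERVER_BATCH_FIRST_DIM) != 0;
///   const bool decoupled = (txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0;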
/// Get the metadata of the server as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param server_metadata Returns the server metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerMetadata(
TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
/// Get the metadata of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the message object and must
/// call TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model.
/// If -1 then the server will choose a version based on the model's
/// policy.
/// \param model_metadata Returns the model metadata message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelMetadata(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_metadata);
/// Get the statistics of a model as a TRITONSERVER_Message
/// object. The caller takes ownership of the object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// If empty, then statistics for all available models will be returned,
/// and the server will choose a version based on those models' policies.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param model_stats Returns the model statistics message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelStatistics(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_stats);
/// Get the configuration of a model as a TRITONSERVER_Message object.
/// The caller takes ownership of the message object and must call
/// TRITONSERVER_MessageDelete to release the object.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param model_version The version of the model. If -1 then the
/// server will choose a version based on the model's policy.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned. Currently only version 1 is supported.
/// \param model_config Returns the model config message.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelConfig(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, const uint32_t config_version,
TRITONSERVER_Message** model_config);
/// Get the index of all unique models in the model repositories as a
/// TRITONSERVER_Message object. The caller takes ownership of the
/// message object and must call TRITONSERVER_MessageDelete to release
/// the object.
///
/// If TRITONSERVER_INDEX_FLAG_READY is set in 'flags' only the models
/// that are loaded into the server and ready for inferencing are
/// returned.
///
/// \param server The inference server object.
/// \param flags TRITONSERVER_ModelIndexFlag flags that control how to
/// collect the index.
/// \param model_index Return the model index message that holds the
/// index of all models contained in the server's model repository(s).
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerModelIndex(
TRITONSERVER_Server* server, uint32_t flags,
TRITONSERVER_Message** model_index);
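/// As a usage sketch, listing only the models that are ready, assuming the
/// TRITONSERVER_MessageSerializeToJson and TRITONSERVER_MessageDelete
/// functions declared earlier in this header:
///
///   TRITONSERVER_Message* model_index = nullptr;
///   TRITONSERVER_ServerModelIndex(
///       server, TRITONSERVER_INDEX_FLAG_READY, &model_index);
///   const char* index_json = nullptr;
///   size_t index_json_size = 0;
///   TRITONSERVER_MessageSerializeToJson(
///       model_index, &index_json, &index_json_size);
///   // ... inspect 'index_json' (valid until the message is deleted) ...
///   TRITONSERVER_MessageDelete(model_index);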
/// Load the requested model or reload the model if it is already
/// loaded. The function does not return until the model is loaded or
/// fails to load. The returned error indicates whether the model loaded
/// successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerLoadModel(
TRITONSERVER_Server* server, const char* model_name);
/// Load the requested model or reload the model if it is already
/// loaded, with load parameters provided. The function does not return until
/// the model is loaded or fails to load. The returned error indicates
/// whether the model loaded successfully.
/// Currently the below parameter names are recognized:
/// - "config" : string parameter that contains a JSON representation of the
/// model configuration. This config will be used for loading the model instead
/// of the one in the model directory.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \param parameters The array of load parameters.
/// \param parameter_count The number of parameters.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerLoadModelWithParameters(
TRITONSERVER_Server* server, const char* model_name,
const TRITONSERVER_Parameter** parameters, const uint64_t parameter_count);
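/// As a usage sketch, loading a model with an override configuration,
/// assuming the TRITONSERVER_ParameterNew and TRITONSERVER_ParameterDelete
/// helpers declared earlier in this header ('config_json' is a placeholder
/// for a JSON model configuration string):
///
///   TRITONSERVER_Parameter* config_param = TRITONSERVER_ParameterNew(
///       "config", TRITONSERVER_PARAMETER_STRING, config_json);
///   const TRITONSERVER_Parameter* params[] = {config_param};
///   TRITONSERVER_ServerLoadModelWithParameters(
///       server, "my_model", params, 1 /* parameter_count */);
///   TRITONSERVER_ParameterDelete(config_param);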
/// Unload the requested model. Unloading a model that is not loaded
/// on the server has no effect and a success code will be returned.
/// The function does not wait for the requested model to be fully unloaded;
/// a success code is returned once unloading has been initiated.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerUnloadModel(
TRITONSERVER_Server* server, const char* model_name);
/// Unload the requested model, and also unload any dependent models that
/// were loaded along with the requested model (for example, the models
/// composing an ensemble). Unloading a model that is not loaded on the
/// server has no effect and a success code will be returned.
/// The function does not wait for the requested model and all dependent
/// models to be fully unloaded; a success code is returned once unloading
/// has been initiated.
/// The returned error indicates whether the unload was initiated successfully.
///
/// \param server The inference server object.
/// \param model_name The name of the model.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error*
TRITONSERVER_ServerUnloadModelAndDependents(
TRITONSERVER_Server* server, const char* model_name);
/// Get the current metrics for the server. The caller takes ownership
/// of the metrics object and must call TRITONSERVER_MetricsDelete to
/// release the object.
///
/// \param server The inference server object.
/// \param metrics Returns the metrics.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerMetrics(
TRITONSERVER_Server* server, TRITONSERVER_Metrics** metrics);
/// Perform inference using the meta-data and inputs supplied by the
/// 'inference_request'. If the function returns success, then the
/// caller releases ownership of 'inference_request' and must not
/// access it in any way after this call, until ownership is returned
/// via the 'request_release_fn' callback registered in the request
/// object with TRITONSERVER_InferenceRequestSetReleaseCallback.
///
/// The function unconditionally takes ownership of 'trace' and so the
/// caller must not access it in any way after this call (except in
/// the trace activity callbacks) until ownership is returned via the
/// trace's release_fn callback.
///
/// Responses produced for this request are returned using the
/// allocator and callback registered with the request by
/// TRITONSERVER_InferenceRequestSetResponseCallback.
///
/// \param server The inference server object.
/// \param inference_request The request object.
/// \param trace The trace object for this request, or nullptr if no
/// tracing.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_ServerInferAsync(
TRITONSERVER_Server* server,
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceTrace* trace);
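/// End to end, a sketch of submitting a request, assuming the
/// TRITONSERVER_InferenceRequestNew constructor declared earlier in this
/// header (inputs, requested outputs, and callbacks are set up as shown
/// earlier):
///
///   TRITONSERVER_InferenceRequest* request = nullptr;
///   TRITONSERVER_InferenceRequestNew(
///       &request, server, "my_model", -1 /* version per policy */);
///   // ... add inputs, requested outputs, and the release/response
///   // callbacks ...
///   TRITONSERVER_ServerInferAsync(server, request, nullptr /* no trace */);
///   // On success, 'request' must not be accessed again until the release
///   // callback returns ownership.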
/// TRITONSERVER_MetricKind
///
/// Types of metrics recognized by TRITONSERVER.
///
typedef enum TRITONSERVER_metrickind_enum {
TRITONSERVER_METRIC_KIND_COUNTER,
TRITONSERVER_METRIC_KIND_GAUGE
} TRITONSERVER_MetricKind;
/// Create a new metric family object. The caller takes ownership of the
/// TRITONSERVER_MetricFamily object and must call
/// TRITONSERVER_MetricFamilyDelete to release the object.
///
/// \param family Returns the new metric family object.
/// \param kind The type of metric family to create.
/// \param name The name of the metric family seen when calling the metrics
/// endpoint.
/// \param description The description of the metric family seen when
/// calling the metrics endpoint.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyNew(
TRITONSERVER_MetricFamily** family, const TRITONSERVER_MetricKind kind,
const char* name, const char* description);
/// Delete a metric family object.
/// A TRITONSERVER_MetricFamily* object should be deleted AFTER its
/// corresponding TRITONSERVER_Metric* objects have been deleted.
/// Attempting to delete a family before its metrics will return an error.
///
/// \param family The metric family object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricFamilyDelete(
TRITONSERVER_MetricFamily* family);
/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in. Each label can be deleted
/// immediately after creating the metric with TRITONSERVER_ParameterDelete
/// if not re-using the labels.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricNew(
TRITONSERVER_Metric** metric, TRITONSERVER_MetricFamily* family,
const TRITONSERVER_Parameter** labels, const uint64_t label_count);
/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
/// If a family is deleted before its metrics, an error will be returned.
///
/// \param metric The metric object to delete.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricDelete(
TRITONSERVER_Metric* metric);
/// Get the current value of a metric object.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_COUNTER
/// and TRITONSERVER_METRIC_KIND_GAUGE, and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to query.
/// \param value Returns the current value of the metric object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricValue(
TRITONSERVER_Metric* metric, double* value);
/// Increment the current value of metric by value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE for any value,
/// and TRITONSERVER_METRIC_KIND_COUNTER for non-negative values. Returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind
/// and TRITONSERVER_ERROR_INVALID_ARG for negative values on a
/// TRITONSERVER_METRIC_KIND_COUNTER metric.
///
/// \param metric The metric object to update.
/// \param value The amount to increment the metric's value by.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricIncrement(
TRITONSERVER_Metric* metric, double value);
/// Set the current value of metric to value.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_GAUGE and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The value to set the metric to.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_MetricSet(
TRITONSERVER_Metric* metric, double value);
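/// Taken together, a sketch of a custom counter metric, assuming the
/// TRITONSERVER_ParameterNew helper declared earlier in this header for the
/// label (the family and label names are illustrative):
///
///   TRITONSERVER_MetricFamily* family = nullptr;
///   TRITONSERVER_MetricFamilyNew(
///       &family, TRITONSERVER_METRIC_KIND_COUNTER, "custom_requests_total",
///       "Example counter of handled requests");
///
///   TRITONSERVER_Parameter* label = TRITONSERVER_ParameterNew(
///       "endpoint", TRITONSERVER_PARAMETER_STRING, "infer");
///   const TRITONSERVER_Parameter* labels[] = {label};
///   TRITONSERVER_Metric* metric = nullptr;
///   TRITONSERVER_MetricNew(&metric, family, labels, 1 /* label_count */);
///   TRITONSERVER_ParameterDelete(label);
///
///   TRITONSERVER_MetricIncrement(metric, 1.0);
///
///   // Metrics must be deleted before their family.
///   TRITONSERVER_MetricDelete(metric);
///   TRITONSERVER_MetricFamilyDelete(family);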
/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC TRITONSERVER_Error* TRITONSERVER_GetMetricKind(
TRITONSERVER_Metric* metric, TRITONSERVER_MetricKind* kind);
#ifdef __cplusplus
}
#endif
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_config.h"
#include "status.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
namespace {
Status
GetTFSpecializedBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
std::string* specialized_name)
{
std::string tf_version_str = "2";
const auto& itr = config_map.find("tensorflow");
if (itr != config_map.end()) {
if (BackendConfiguration(itr->second, "version", &tf_version_str).IsOk()) {
if ((tf_version_str != "1") && (tf_version_str != "2")) {
return Status(
Status::Code::INVALID_ARG,
"unexpected TensorFlow library version '" + tf_version_str +
"', expects 1 or 2.");
}
}
}
*specialized_name += tf_version_str;
return Status::Success;
}
} // namespace
Status
BackendConfiguration(
const triton::common::BackendCmdlineConfig& config, const std::string& key,
std::string* val)
{
for (const auto& pr : config) {
if (pr.first == key) {
*val = pr.second;
return Status::Success;
}
}
return Status(
Status::Code::INTERNAL,
std::string("unable to find common backend configuration for '") + key +
"'");
}
Status
BackendConfigurationParseStringToDouble(const std::string& str, double* val)
{
try {
*val = std::stod(str);
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to parse common backend configuration as double");
}
return Status::Success;
}
Status
BackendConfigurationParseStringToBool(const std::string& str, bool* val)
{
try {
std::string lowercase_str{str};
std::transform(
lowercase_str.begin(), lowercase_str.end(), lowercase_str.begin(),
[](unsigned char c) { return std::tolower(c); });
*val = (lowercase_str == "true");
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to parse common backend configuration as bool");
}
return Status::Success;
}
Status
BackendConfigurationGlobalBackendsDirectory(
const triton::common::BackendCmdlineConfigMap& config_map, std::string* dir)
{
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL,
"unable to find global backends directory configuration");
}
RETURN_IF_ERROR(BackendConfiguration(itr->second, "backend-directory", dir));
return Status::Success;
}
Status
BackendConfigurationMinComputeCapability(
const triton::common::BackendCmdlineConfigMap& config_map, double* mcc)
{
#ifdef TRITON_ENABLE_GPU
*mcc = TRITON_MIN_COMPUTE_CAPABILITY;
#else
*mcc = 0;
#endif // TRITON_ENABLE_GPU
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL, "unable to find common backend configuration");
}
std::string min_compute_capability_str;
RETURN_IF_ERROR(BackendConfiguration(
itr->second, "min-compute-capability", &min_compute_capability_str));
RETURN_IF_ERROR(
BackendConfigurationParseStringToDouble(min_compute_capability_str, mcc));
return Status::Success;
}
Status
BackendConfigurationAutoCompleteConfig(
const triton::common::BackendCmdlineConfigMap& config_map, bool* acc)
{
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
return Status(
Status::Code::INTERNAL, "unable to find auto-complete configuration");
}
std::string auto_complete_config_str;
RETURN_IF_ERROR(BackendConfiguration(
itr->second, "auto-complete-config", &auto_complete_config_str));
RETURN_IF_ERROR(
BackendConfigurationParseStringToBool(auto_complete_config_str, acc));
return Status::Success;
}
Status
BackendConfigurationSpecializeBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
const std::string& backend_name, std::string* specialized_name)
{
*specialized_name = backend_name;
if (backend_name == "tensorflow") {
RETURN_IF_ERROR(GetTFSpecializedBackendName(config_map, specialized_name));
}
return Status::Success;
}
Status
BackendConfigurationBackendLibraryName(
const std::string& backend_name, std::string* libname)
{
#ifdef _WIN32
*libname = "triton_" + backend_name + ".dll";
#else
*libname = "libtriton_" + backend_name + ".so";
#endif
return Status::Success;
}
Status
BackendConfigurationModelLoadGpuFraction(
const triton::common::BackendCmdlineConfigMap& config_map,
const int device_id, double* memory_limit)
{
*memory_limit = 1.0;
const auto& itr = config_map.find(std::string());
if (itr == config_map.end()) {
    return Status(
        Status::Code::INTERNAL, "unable to find common backend configuration");
}
static std::string key_prefix = "model-load-gpu-limit-device-";
std::string memory_limit_str;
auto status = BackendConfiguration(
itr->second, key_prefix + std::to_string(device_id), &memory_limit_str);
// Allow missing key, default to 1.0 (no limit) if the limit is not specified
if (status.IsOk()) {
RETURN_IF_ERROR(BackendConfigurationParseStringToDouble(
memory_limit_str, memory_limit));
}
return Status::Success;
}
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
/// Get a key's string value from a backend configuration.
Status BackendConfiguration(
const triton::common::BackendCmdlineConfig& config, const std::string& key,
std::string* val);
/// Convert a backend configuration string value into a double.
Status BackendConfigurationParseStringToDouble(
const std::string& str, double* val);
/// Convert a backend configuration string value into a bool.
Status BackendConfigurationParseStringToBool(const std::string& str, bool* val);
/// Get the global backends directory from the backend configuration.
Status BackendConfigurationGlobalBackendsDirectory(
const triton::common::BackendCmdlineConfigMap& config_map,
std::string* dir);
/// Get the minimum compute capability from the backend configuration.
Status BackendConfigurationMinComputeCapability(
const triton::common::BackendCmdlineConfigMap& config_map, double* mcc);
/// Get the model configuration auto-complete setting from the backend
/// configuration.
Status BackendConfigurationAutoCompleteConfig(
const triton::common::BackendCmdlineConfigMap& config_map, bool* acc);
/// Convert a backend name to the specialized version of that name
/// based on the backend configuration. For example, "tensorflow" will
/// convert to either "tensorflow1" or "tensorflow2" depending on how
/// tritonserver is run.
Status BackendConfigurationSpecializeBackendName(
const triton::common::BackendCmdlineConfigMap& config_map,
const std::string& backend_name, std::string* specialized_name);
/// Return the shared library name for a backend.
Status BackendConfigurationBackendLibraryName(
const std::string& backend_name, std::string* libname);
/// Get GPU memory limit fraction for model loading
/// from the backend configuration.
Status BackendConfigurationModelLoadGpuFraction(
const triton::common::BackendCmdlineConfigMap& config_map,
const int device_id, double* memory_limit);
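// Illustrative sketch (not part of the API) of how these helpers are chained
// when a backend is loaded; error handling via RETURN_IF_ERROR is omitted and
// 'config_map' is assumed to be a populated BackendCmdlineConfigMap:
//
//   std::string backend_dir, specialized_name, libname;
//   BackendConfigurationGlobalBackendsDirectory(config_map, &backend_dir);
//   BackendConfigurationSpecializeBackendName(
//       config_map, "tensorflow", &specialized_name);
//   BackendConfigurationBackendLibraryName(specialized_name, &libname);
//   // The backend shared library is then searched for at
//   // <backend_dir>/<specialized_name>/<libname>, among other locations.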
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_manager.h"
#include "backend_memory_manager.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
//
// TritonBackend
//
Status
TritonBackend::Create(
const std::string& name, const std::string& dir, const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend)
{
// Create the JSON representation of the backend configuration.
triton::common::TritonJson::Value backend_config_json(
triton::common::TritonJson::ValueType::OBJECT);
if (!backend_cmdline_config.empty()) {
triton::common::TritonJson::Value cmdline_json(
backend_config_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : backend_cmdline_config) {
RETURN_IF_ERROR(cmdline_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(
backend_config_json.Add("cmdline", std::move(cmdline_json)));
}
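// As an example, a backend command line configuration of
// {{"default-max-batch-size", "4"}} results in a backend config message of
// the form {"cmdline":{"default-max-batch-size":"4"}}.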
TritonServerMessage backend_config(backend_config_json);
auto local_backend = std::shared_ptr<TritonBackend>(
new TritonBackend(name, dir, libpath, backend_config));
// Load the library and initialize all the entrypoints
RETURN_IF_ERROR(local_backend->LoadBackendLibrary());
// Backend initialization is optional... The TRITONBACKEND_Backend
// object is this TritonBackend object. We must set the shared
// library path to point to the backend directory in case the
// backend library attempts to load additional shared libraries.
if (local_backend->backend_init_fn_ != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(local_backend->dir_));
TRITONSERVER_Error* err = local_backend->backend_init_fn_(
reinterpret_cast<TRITONBACKEND_Backend*>(local_backend.get()));
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
local_backend->UpdateAttributes();
*backend = std::move(local_backend);
return Status::Success;
}
Status
TritonBackend::UpdateAttributes()
{
if (backend_attri_fn_ == nullptr) {
return Status::Success;
}
// Create an Attribute object for the backend to fill. Note that it copies
// some fields from 'attributes_' while the others use default values. This
// is an ad hoc way to determine whether an attribute was set by the backend,
// so we know whether to keep or update the current value.
Attribute latest;
latest.exec_policy_ = attributes_.exec_policy_;
RETURN_IF_TRITONSERVER_ERROR(backend_attri_fn_(
reinterpret_cast<TRITONBACKEND_Backend*>(this),
reinterpret_cast<TRITONBACKEND_BackendAttribute*>(&latest)));
// Update attributes that were set
attributes_.exec_policy_ = latest.exec_policy_;
if (!latest.preferred_groups_.empty()) {
attributes_.preferred_groups_ = latest.preferred_groups_;
}
return Status::Success;
}
TritonBackend::TritonBackend(
const std::string& name, const std::string& dir, const std::string& libpath,
const TritonServerMessage& backend_config)
: name_(name), dir_(dir), libpath_(libpath),
backend_config_(backend_config), state_(nullptr)
{
ClearHandles();
}
TritonBackend::~TritonBackend()
{
LOG_VERBOSE(1) << "unloading backend '" << name_ << "'";
// Backend finalization is optional... The TRITONBACKEND_Backend
// object is this TritonBackend object.
if (backend_fini_fn_ != nullptr) {
LOG_TRITONSERVER_ERROR(
backend_fini_fn_(reinterpret_cast<TRITONBACKEND_Backend*>(this)),
"failed finalizing backend");
}
ClearHandles();
}
void
TritonBackend::ClearHandles()
{
dlhandle_ = nullptr;
backend_init_fn_ = nullptr;
backend_fini_fn_ = nullptr;
backend_attri_fn_ = nullptr;
model_init_fn_ = nullptr;
model_fini_fn_ = nullptr;
inst_init_fn_ = nullptr;
inst_fini_fn_ = nullptr;
inst_exec_fn_ = nullptr;
}
Status
TritonBackend::LoadBackendLibrary()
{
TritonBackendInitFn_t bifn;
TritonBackendFiniFn_t bffn;
TritonBackendAttriFn_t bafn;
TritonModelInitFn_t mifn;
TritonModelFiniFn_t mffn;
TritonModelInstanceInitFn_t iifn;
TritonModelInstanceFiniFn_t iffn;
TritonModelInstanceExecFn_t iefn;
{
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath_, &dlhandle_));
// Backend initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_Initialize", true /* optional */,
reinterpret_cast<void**>(&bifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_Finalize", true /* optional */,
reinterpret_cast<void**>(&bffn)));
// Backend attribute function, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_GetBackendAttribute", true /* optional */,
reinterpret_cast<void**>(&bafn)));
// Model initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInitialize", true /* optional */,
reinterpret_cast<void**>(&mifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelFinalize", true /* optional */,
reinterpret_cast<void**>(&mffn)));
// Model instance initialize and finalize functions, optional
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceInitialize", true /* optional */,
reinterpret_cast<void**>(&iifn)));
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceFinalize", true /* optional */,
reinterpret_cast<void**>(&iffn)));
// Model instance execute function, required
RETURN_IF_ERROR(slib->GetEntrypoint(
dlhandle_, "TRITONBACKEND_ModelInstanceExecute", false /* optional */,
reinterpret_cast<void**>(&iefn)));
}
backend_init_fn_ = bifn;
backend_fini_fn_ = bffn;
backend_attri_fn_ = bafn;
model_init_fn_ = mifn;
model_fini_fn_ = mffn;
inst_init_fn_ = iifn;
inst_fini_fn_ = iffn;
inst_exec_fn_ = iefn;
return Status::Success;
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ApiVersion(uint32_t* major, uint32_t* minor)
{
*major = TRITONBACKEND_API_VERSION_MAJOR;
*minor = TRITONBACKEND_API_VERSION_MINOR;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendName(TRITONBACKEND_Backend* backend, const char** name)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*name = tb->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendConfig(
TRITONBACKEND_Backend* backend, TRITONSERVER_Message** backend_config)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*backend_config = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&tb->BackendConfig()));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy* policy)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*policy = tb->ExecutionPolicy();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetExecutionPolicy(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ExecutionPolicy policy)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
tb->SetExecutionPolicy(policy);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendArtifacts(
TRITONBACKEND_Backend* backend, TRITONBACKEND_ArtifactType* artifact_type,
const char** location)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
*location = tb->Directory().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendMemoryManager(
TRITONBACKEND_Backend* backend, TRITONBACKEND_MemoryManager** manager)
{
static TritonMemoryManager gMemoryManager;
*manager = reinterpret_cast<TRITONBACKEND_MemoryManager*>(&gMemoryManager);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendState(TRITONBACKEND_Backend* backend, void** state)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
*state = tb->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendSetState(TRITONBACKEND_Backend* backend, void* state)
{
TritonBackend* tb = reinterpret_cast<TritonBackend*>(backend);
tb->SetState(state);
return nullptr; // success
}
} // extern C
//
// TritonBackendManager
//
static std::weak_ptr<TritonBackendManager> backend_manager_;
static std::mutex mu_;
Status
TritonBackendManager::Create(std::shared_ptr<TritonBackendManager>* manager)
{
std::lock_guard<std::mutex> lock(mu_);
// If there is already a manager then we just use it...
*manager = backend_manager_.lock();
if (*manager != nullptr) {
return Status::Success;
}
manager->reset(new TritonBackendManager());
backend_manager_ = *manager;
return Status::Success;
}
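// Illustrative sketch (not part of the implementation) of the sharing
// behavior above:
//
//   std::shared_ptr<TritonBackendManager> m1, m2;
//   TritonBackendManager::Create(&m1);
//   TritonBackendManager::Create(&m2);  // m2 aliases m1 while m1 is alive
//
// Once every shared_ptr is released the weak_ptr expires, and a subsequent
// Create() constructs a fresh manager.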
Status
TritonBackendManager::CreateBackend(
const std::string& name, const std::string& dir, const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend)
{
std::lock_guard<std::mutex> lock(mu_);
const auto& itr = backend_map_.find(libpath);
if (itr != backend_map_.end()) {
*backend = itr->second;
return Status::Success;
}
RETURN_IF_ERROR(TritonBackend::Create(
name, dir, libpath, backend_cmdline_config, backend));
backend_map_.insert({libpath, *backend});
return Status::Success;
}
Status
TritonBackendManager::BackendState(
std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>*
backend_state)
{
std::lock_guard<std::mutex> lock(mu_);
std::unique_ptr<std::unordered_map<std::string, std::vector<std::string>>>
backend_state_map(
new std::unordered_map<std::string, std::vector<std::string>>);
for (const auto& backend_pair : backend_map_) {
auto& libpath = backend_pair.first;
auto backend = backend_pair.second;
const char* backend_config;
size_t backend_config_size;
backend->BackendConfig().Serialize(&backend_config, &backend_config_size);
backend_state_map->insert(
{backend->Name(), std::vector<std::string>{libpath, backend_config}});
}
*backend_state = std::move(backend_state_map);
return Status::Success;
}
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include "constants.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// Proxy to a backend shared library.
//
class TritonBackend {
public:
struct Attribute {
Attribute() : exec_policy_(TRITONBACKEND_EXECUTION_BLOCKING) {}
TRITONBACKEND_ExecutionPolicy exec_policy_;
std::vector<inference::ModelInstanceGroup> preferred_groups_;
};
typedef TRITONSERVER_Error* (*TritonModelInitFn_t)(
TRITONBACKEND_Model* model);
typedef TRITONSERVER_Error* (*TritonModelFiniFn_t)(
TRITONBACKEND_Model* model);
typedef TRITONSERVER_Error* (*TritonModelInstanceInitFn_t)(
TRITONBACKEND_ModelInstance* instance);
typedef TRITONSERVER_Error* (*TritonModelInstanceFiniFn_t)(
TRITONBACKEND_ModelInstance* instance);
typedef TRITONSERVER_Error* (*TritonModelInstanceExecFn_t)(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_cnt);
static Status Create(
const std::string& name, const std::string& dir,
const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend);
~TritonBackend();
const std::string& Name() const { return name_; }
const std::string& Directory() const { return dir_; }
const TritonServerMessage& BackendConfig() const { return backend_config_; }
const Attribute& BackendAttributes() const { return attributes_; }
TRITONBACKEND_ExecutionPolicy ExecutionPolicy() const
{
return attributes_.exec_policy_;
}
void SetExecutionPolicy(const TRITONBACKEND_ExecutionPolicy policy)
{
attributes_.exec_policy_ = policy;
}
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
TritonModelInitFn_t ModelInitFn() const { return model_init_fn_; }
TritonModelFiniFn_t ModelFiniFn() const { return model_fini_fn_; }
TritonModelInstanceInitFn_t ModelInstanceInitFn() const
{
return inst_init_fn_;
}
TritonModelInstanceFiniFn_t ModelInstanceFiniFn() const
{
return inst_fini_fn_;
}
TritonModelInstanceExecFn_t ModelInstanceExecFn() const
{
return inst_exec_fn_;
}
private:
typedef TRITONSERVER_Error* (*TritonBackendInitFn_t)(
TRITONBACKEND_Backend* backend);
typedef TRITONSERVER_Error* (*TritonBackendFiniFn_t)(
TRITONBACKEND_Backend* backend);
typedef TRITONSERVER_Error* (*TritonBackendAttriFn_t)(
TRITONBACKEND_Backend* backend,
TRITONBACKEND_BackendAttribute* backend_attributes);
TritonBackend(
const std::string& name, const std::string& dir,
const std::string& libpath, const TritonServerMessage& backend_config);
void ClearHandles();
Status LoadBackendLibrary();
Status UpdateAttributes();
// The name of the backend.
const std::string name_;
// Full path to the directory holding the backend shared library and
// other artifacts.
const std::string dir_;
// Full path to the backend shared library.
const std::string libpath_;
// Backend configuration as JSON
TritonServerMessage backend_config_;
// backend attributes
Attribute attributes_;
// dlopen / dlsym handles
void* dlhandle_;
TritonBackendInitFn_t backend_init_fn_;
TritonBackendFiniFn_t backend_fini_fn_;
TritonBackendAttriFn_t backend_attri_fn_;
TritonModelInitFn_t model_init_fn_;
TritonModelFiniFn_t model_fini_fn_;
TritonModelInstanceInitFn_t inst_init_fn_;
TritonModelInstanceFiniFn_t inst_fini_fn_;
TritonModelInstanceExecFn_t inst_exec_fn_;
// Opaque state associated with the backend.
void* state_;
};
//
// Manage communication with Triton backends and their lifecycle.
//
class TritonBackendManager {
public:
static Status Create(std::shared_ptr<TritonBackendManager>* manager);
Status CreateBackend(
const std::string& name, const std::string& dir,
const std::string& libpath,
const triton::common::BackendCmdlineConfig& backend_cmdline_config,
std::shared_ptr<TritonBackend>* backend);
Status BackendState(
std::unique_ptr<
std::unordered_map<std::string, std::vector<std::string>>>*
backend_state);
private:
DISALLOW_COPY_AND_ASSIGN(TritonBackendManager);
TritonBackendManager() = default;
std::unordered_map<std::string, std::shared_ptr<TritonBackend>> backend_map_;
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_memory_manager.h"
#include "pinned_memory_manager.h"
#include "status.h"
#include "tritonserver_apis.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerAllocate(
TRITONBACKEND_MemoryManager* manager, void** buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id,
const uint64_t byte_size)
{
switch (memory_type) {
case TRITONSERVER_MEMORY_GPU:
#ifdef TRITON_ENABLE_GPU
{
auto status = CudaMemoryManager::Alloc(buffer, byte_size, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
break;
}
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"GPU memory allocation not supported");
#endif // TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
{
TRITONSERVER_MemoryType mt = memory_type;
auto status = PinnedMemoryManager::Alloc(buffer, byte_size, &mt, false);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
break;
}
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Pinned memory allocation not supported");
#endif // TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_CPU: {
*buffer = malloc(byte_size);
if (*buffer == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNAVAILABLE, "CPU memory allocation failed");
}
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_MemoryManagerFree(
TRITONBACKEND_MemoryManager* manager, void* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
switch (memory_type) {
case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
auto status = CudaMemoryManager::Free(buffer, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
#endif // TRITON_ENABLE_GPU
break;
}
case TRITONSERVER_MEMORY_CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
auto status = PinnedMemoryManager::Free(buffer);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()),
status.Message().c_str());
}
#endif // TRITON_ENABLE_GPU
break;
}
case TRITONSERVER_MEMORY_CPU:
free(buffer);
break;
}
return nullptr; // success
}
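// Illustrative backend-side usage of the two entrypoints above (a sketch,
// not part of the implementation; error checking omitted and 'manager'
// assumed to come from TRITONBACKEND_BackendMemoryManager):
//
//   void* buffer = nullptr;
//   TRITONBACKEND_MemoryManagerAllocate(
//       manager, &buffer, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */,
//       1024 /* byte_size */);
//   ...
//   TRITONBACKEND_MemoryManagerFree(
//       manager, buffer, TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */);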
} // extern C
}} // namespace triton::core
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace core {
// Currently there is just a global memory manager that is used for
// all backends and which simply forwards requests on to the core
// memory manager.
struct TritonMemoryManager {
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model.h"
#include <vector>
#include "backend_config.h"
#include "backend_model_instance.h"
#include "dynamic_batch_scheduler.h"
#include "filesystem.h"
#include "model_config_utils.h"
#include "numa_utils.h"
#include "sequence_batch_scheduler.h"
#include "sequence_state.h"
#include "server.h"
#include "server_message.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
Status
TritonModel::Create(
InferenceServer* server, const std::string& model_path,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const std::string& model_name, const int64_t version,
inference::ModelConfig model_config, const bool is_config_provided,
std::unique_ptr<TritonModel>* model)
{
model->reset();
// The model configuration must specify a backend. The name of the
// corresponding shared library must be libtriton_<backend>.so
// (triton_<backend>.dll on Windows).
if (model_config.backend().empty()) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'backend' for '" + model_config.name() + "'");
}
// Localize the content of the model repository corresponding to
// 'model_name'. This model holds a handle to the localized content
// so that it persists as long as the model is loaded.
std::shared_ptr<LocalizedPath> localized_model_dir;
RETURN_IF_ERROR(LocalizePath(model_path, &localized_model_dir));
// Localize paths in backend model config
// [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
RETURN_IF_ERROR(LocalizePythonBackendExecutionEnvironmentPath(
model_path, &model_config, &localized_model_dir));
// Get some internal configuration values needed for initialization.
std::string backend_dir;
RETURN_IF_ERROR(BackendConfigurationGlobalBackendsDirectory(
backend_cmdline_config_map, &backend_dir));
bool auto_complete_config = false;
RETURN_IF_ERROR(BackendConfigurationAutoCompleteConfig(
backend_cmdline_config_map, &auto_complete_config));
double min_compute_capability = 0;
RETURN_IF_ERROR(BackendConfigurationMinComputeCapability(
backend_cmdline_config_map, &min_compute_capability));
std::string specialized_backend_name;
RETURN_IF_ERROR(BackendConfigurationSpecializeBackendName(
backend_cmdline_config_map, model_config.backend(),
&specialized_backend_name));
std::string backend_libname;
RETURN_IF_ERROR(BackendConfigurationBackendLibraryName(
specialized_backend_name, &backend_libname));
// Get the path to the backend shared library. Search path is
// version directory, model directory, global backend directory.
const auto localized_model_path = localized_model_dir->Path();
const auto version_path =
JoinPath({localized_model_path, std::to_string(version)});
const std::string global_path =
JoinPath({backend_dir, specialized_backend_name});
const std::vector<std::string> search_paths = {
version_path, localized_model_path, global_path};
std::string backend_libdir;
std::string backend_libpath;
for (const auto& path : search_paths) {
const auto full_path = JoinPath({path, backend_libname});
bool exists = false;
RETURN_IF_ERROR(FileExists(full_path, &exists));
if (exists) {
backend_libdir = path;
backend_libpath = full_path;
break;
}
}
if (backend_libpath.empty()) {
return Status(
Status::Code::INVALID_ARG, "unable to find '" + backend_libname +
"' for model '" + model_config.name() +
"', searched: " + version_path + ", " +
model_path + ", " + global_path);
}
// Resolve the global backend configuration with the specific backend
// configuration
triton::common::BackendCmdlineConfig config;
RETURN_IF_ERROR(ResolveBackendConfigs(
backend_cmdline_config_map, model_config.backend(), config));
RETURN_IF_ERROR(SetBackendConfigDefaults(config));
std::shared_ptr<TritonBackend> backend;
RETURN_IF_ERROR(server->BackendManager()->CreateBackend(
model_config.backend(), backend_libdir, backend_libpath, config,
&backend));
// Normalize backend-dependent config
{
const auto& attributes = backend->BackendAttributes();
// [WIP] formalize config normalization / validation
RETURN_IF_ERROR(NormalizeInstanceGroup(
min_compute_capability, attributes.preferred_groups_, &model_config));
RETURN_IF_ERROR(
ValidateInstanceGroup(model_config, min_compute_capability));
}
// Create and initialize the model.
std::unique_ptr<TritonModel> local_model(new TritonModel(
server, localized_model_dir, backend, min_compute_capability, version,
model_config, auto_complete_config));
TritonModel* raw_local_model = local_model.get();
// Model initialization is optional... The TRITONBACKEND_Model
// object is this TritonModel object. We must set the shared library
// path to point to the backend directory in case the backend
// library attempts to load additional shared libraries.
if (backend->ModelInitFn() != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(backend->Directory()));
TRITONSERVER_Error* err = backend->ModelInitFn()(
reinterpret_cast<TRITONBACKEND_Model*>(raw_local_model));
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
// Initialize the model for Triton core usage
RETURN_IF_ERROR(local_model->Init(is_config_provided));
bool device_blocking = false;
if (local_model->backend_->ExecutionPolicy() ==
TRITONBACKEND_EXECUTION_DEVICE_BLOCKING) {
if (model_config.has_sequence_batching()) {
LOG_INFO << "Overriding execution policy to "
"\"TRITONBACKEND_EXECUTION_BLOCKING\" for sequence model \""
<< model_config.name() << "\"";
} else {
device_blocking = true;
}
}
// Create and initialize the model instances for this model.
RETURN_IF_ERROR(TritonModelInstance::CreateInstances(
raw_local_model, backend_cmdline_config_map, host_policy_map,
model_config, device_blocking));
RETURN_IF_ERROR(local_model->SetConfiguredScheduler());
*model = std::move(local_model);
return Status::Success;
}
Status
TritonModel::ResolveBackendConfigs(
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const std::string& backend_name,
triton::common::BackendCmdlineConfig& config)
{
const auto& global_itr = backend_cmdline_config_map.find(std::string());
const auto& specific_itr = backend_cmdline_config_map.find(backend_name);
if (specific_itr == backend_cmdline_config_map.end() &&
global_itr != backend_cmdline_config_map.end()) {
for (auto setting : global_itr->second) {
config.push_back(setting);
}
} else if (
specific_itr != backend_cmdline_config_map.end() &&
global_itr == backend_cmdline_config_map.end()) {
for (auto setting : specific_itr->second) {
config.push_back(setting);
}
} else if (
specific_itr != backend_cmdline_config_map.end() &&
global_itr != backend_cmdline_config_map.end()) {
triton::common::BackendCmdlineConfig global_backend_config =
global_itr->second;
triton::common::BackendCmdlineConfig specific_backend_config =
specific_itr->second;
std::sort(global_backend_config.begin(), global_backend_config.end());
std::sort(specific_backend_config.begin(), specific_backend_config.end());
size_t global_index = 0;
size_t specific_index = 0;
while (global_index < global_backend_config.size() &&
specific_index < specific_backend_config.size()) {
auto& current_global_setting = global_backend_config.at(global_index);
auto& current_specific_setting =
specific_backend_config.at(specific_index);
if (current_specific_setting.first.compare(
current_global_setting.first) == 0) {
// specific setting overrides global setting
config.push_back(current_specific_setting);
++global_index;
++specific_index;
} else if (
current_specific_setting.first.compare(current_global_setting.first) <
0) {
config.push_back(current_specific_setting);
++specific_index;
} else {
config.push_back(current_global_setting);
++global_index;
}
}
// Add the rest of the global configs
while (global_index < global_backend_config.size()) {
config.push_back(global_backend_config.at(global_index++));
}
// Add the rest of the specific settings
while (specific_index < specific_backend_config.size()) {
config.push_back(specific_backend_config.at(specific_index++));
}
} // else empty config
return Status::Success;
}
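// Illustrative example (not part of the implementation) of the merge above:
// with a global config of {{"a", "1"}, {"b", "2"}} and a backend-specific
// config of {{"b", "3"}}, the resolved 'config' is {{"a", "1"}, {"b", "3"}};
// the backend-specific value overrides the global one and all other settings
// are kept.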
const std::unordered_map<std::string, std::string> backend_config_defaults(
{{"default-max-batch-size", "4"}});
Status
TritonModel::SetBackendConfigDefaults(
triton::common::BackendCmdlineConfig& config)
{
auto backend_config_defaults_copy = backend_config_defaults;
for (auto& setting : config) {
if (setting.first.compare("default-max-batch-size") == 0) {
LOG_VERBOSE(1) << "Found overwritten default setting: " << setting.first
<< "," << setting.second;
backend_config_defaults_copy.erase(setting.first);
}
if (backend_config_defaults_copy.empty()) {
break;
}
}
// Anything left should be added to the config
for (const auto& default_setting : backend_config_defaults_copy) {
LOG_VERBOSE(1) << "Adding default backend config setting: "
<< default_setting.first << "," << default_setting.second;
config.push_back(
std::make_pair(default_setting.first, default_setting.second));
}
return Status::Success;
}
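// Illustrative example (not part of the implementation): a config that does
// not mention "default-max-batch-size" gets {"default-max-batch-size", "4"}
// appended; a config that already sets it is left untouched.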
Status
TritonModel::AddInstance(
std::unique_ptr<TritonModelInstance>&& instance, const bool passive)
{
if (passive) {
passive_instances_.emplace_back(std::move(instance));
} else {
instances_.emplace_back(std::move(instance));
}
return Status::Success;
}
Status
TritonModel::UpdateModelConfig(
const uint32_t config_version, TRITONSERVER_Message* updated_config_message)
{
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(TRITONSERVER_MessageSerializeToJson(
updated_config_message, &buffer, &byte_size));
inference::ModelConfig updated_config;
RETURN_IF_ERROR(
JsonToModelConfig({buffer, byte_size}, config_version, &updated_config));
auto config = Config();
config.set_max_batch_size(updated_config.max_batch_size());
auto inputs_config = config.mutable_input();
*inputs_config = updated_config.input();
auto outputs_config = config.mutable_output();
*outputs_config = updated_config.output();
if (!config.scheduling_choice_case()) {
if (updated_config.has_dynamic_batching()) {
auto dynamic_batching_config = config.mutable_dynamic_batching();
*dynamic_batching_config = updated_config.dynamic_batching();
} else if (updated_config.has_sequence_batching()) {
auto sequence_batching_config = config.mutable_sequence_batching();
*sequence_batching_config = updated_config.sequence_batching();
} else if (updated_config.has_ensemble_scheduling()) {
auto ensemble_scheduling_config = config.mutable_ensemble_scheduling();
*ensemble_scheduling_config = updated_config.ensemble_scheduling();
} // else do nothing
} else if (
config.scheduling_choice_case() !=
updated_config.scheduling_choice_case()) {
return Status(
triton::common::Error::Code::INTERNAL,
(std::string("Cannot update scheduling choice from ") +
std::to_string(config.scheduling_choice_case()) + std::string(" to ") +
std::to_string(updated_config.scheduling_choice_case()) +
std::string(" when auto-completing."))
.c_str());
} // else do nothing
// Normalize the model configuration to populate missing fields.
RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability_, &config));
RETURN_IF_ERROR(SetModelConfig(config));
return Status::Success;
}
Status
TritonModel::SetConfiguredScheduler()
{
std::unique_ptr<Scheduler> scheduler;
// Need to enforce equal shape batches (i.e. non-ragged batches) if
// the model 1) allows one or more variable-size input tensors that
// are not marked as 'allow_ragged_batch' or 2) has one or more
// shape-tensor inputs. This is not needed if all input shapes are
// non-variable and if there are no shape tensors... so we don't
// enable it in that case for efficiency reasons.
std::unordered_map<std::string, bool> enforce_equal_shape_tensors;
for (const auto input : config_.input()) {
if (input.is_shape_tensor()) {
enforce_equal_shape_tensors.insert({input.name(), true});
} else if (
!input.allow_ragged_batch() &&
(triton::common::GetElementCount(input) == -1)) {
enforce_equal_shape_tensors.insert({input.name(), false});
}
}
// If 'sequence_batching' is configured, then use the SequenceBatchScheduler,
// otherwise use the default DynamicBatchScheduler.
if (config_.has_sequence_batching()) {
// Sequence batcher
RETURN_IF_ERROR(SequenceBatchScheduler::Create(
this, enforce_equal_shape_tensors, &scheduler));
} else if (config_.has_dynamic_batching()) {
// Dynamic batcher
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
this, nullptr, 0 /*nice*/, true /* dynamic_batching_enabled */,
config_.max_batch_size(), enforce_equal_shape_tensors,
config_.dynamic_batching(),
config_.response_cache().enable() /* response_cache_enable */,
&scheduler));
} else {
// Default scheduler. Use dynamic batch scheduler (with batching
// disabled) as the default scheduler.
RETURN_IF_ERROR(DynamicBatchScheduler::Create(
this, nullptr, 0 /*nice*/, false /* dynamic_batching_enabled */,
1 /* max_batch_size */,
std::unordered_map<
std::string, bool>() /* enforce_equal_shape_tensors */,
false /* preserve_ordering */,
config_.response_cache().enable() /* response_cache_enable */,
std::set<int32_t>() /* preferred_batch_sizes */,
0 /* max_queue_delay_microseconds */, &scheduler));
}
return SetScheduler(std::move(scheduler));
}
Status
TritonModel::Initialize()
{
for (const auto& instance : instances_) {
RETURN_IF_ERROR(instance->Initialize());
}
return Status::Success;
}
Status
TritonModel::WarmUp()
{
for (const auto& instance : instances_) {
RETURN_IF_ERROR(instance->WarmUp());
}
return Status::Success;
}
TritonModel::TritonModel(
InferenceServer* server,
const std::shared_ptr<LocalizedPath>& localized_model_dir,
const std::shared_ptr<TritonBackend>& backend,
const double min_compute_capability, const int64_t version,
const inference::ModelConfig& config, const bool auto_complete_config)
: Model(
min_compute_capability, localized_model_dir->Path(), version, config),
server_(server), min_compute_capability_(min_compute_capability),
auto_complete_config_(auto_complete_config),
localized_model_dir_(localized_model_dir), backend_(backend),
state_(nullptr)
{
}
TritonModel::~TritonModel()
{
// Explicitly delete/finalize all model instances before finalizing
// the model itself.
instances_.clear();
passive_instances_.clear();
// Unregister the model from the rate limiter. Note this should happen
// after all instances are destructed. Destructing the instances ensures
// there are no instance threads waiting on the rate limiter to
// receive their payloads.
server_->GetRateLimiter()->UnregisterModel(this);
// Model finalization is optional... The TRITONBACKEND_Model
// object is this TritonModel object.
if (backend_->ModelFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
backend_->ModelFiniFn()(reinterpret_cast<TRITONBACKEND_Model*>(this)),
"failed finalizing model");
}
}
extern "C" {
//
// TRITONBACKEND_Model
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelName(TRITONBACKEND_Model* model, const char** name)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*name = tm->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelVersion(TRITONBACKEND_Model* model, uint64_t* version)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*version = tm->Version();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelRepository(
TRITONBACKEND_Model* model, TRITONBACKEND_ArtifactType* artifact_type,
const char** location)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*artifact_type = TRITONBACKEND_ARTIFACT_FILESYSTEM;
*location = tm->LocalizedModelPath().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message** model_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
std::string model_config_json;
Status status =
ModelConfigToJson(tm->Config(), config_version, &model_config_json);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*model_config = reinterpret_cast<TRITONSERVER_Message*>(
new TritonServerMessage(std::move(model_config_json)));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelAutoCompleteConfig(
TRITONBACKEND_Model* model, bool* auto_complete_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*auto_complete_config = tm->AutoCompleteConfig();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetConfig(
TRITONBACKEND_Model* model, const uint32_t config_version,
TRITONSERVER_Message* model_config)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
Status status = tm->UpdateModelConfig(config_version, model_config);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelServer(
TRITONBACKEND_Model* model, TRITONSERVER_Server** server)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*server = reinterpret_cast<TRITONSERVER_Server*>(tm->Server());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelBackend(
TRITONBACKEND_Model* model, TRITONBACKEND_Backend** backend)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*backend = reinterpret_cast<TRITONBACKEND_Backend*>(tm->Backend().get());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelState(TRITONBACKEND_Model* model, void** state)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
*state = tm->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelSetState(TRITONBACKEND_Model* model, void* state)
{
TritonModel* tm = reinterpret_cast<TritonModel*>(model);
tm->SetState(state);
return nullptr; // success
}
///
/// TRITONBACKEND_Request
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestId(TRITONBACKEND_Request* request, const char** id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*id = tr->Id().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationId(TRITONBACKEND_Request* request, uint64_t* id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::UINT64) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "correlation ID in request is not an unsigned int")
.c_str());
}
*id = correlation_id.UnsignedIntValue();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestFlags(TRITONBACKEND_Request* request, uint32_t* flags)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*flags = tr->Flags();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestCorrelationIdString(
TRITONBACKEND_Request* request, const char** id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const InferenceRequest::SequenceId& correlation_id = tr->CorrelationId();
if (correlation_id.Type() != InferenceRequest::SequenceId::DataType::STRING) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "correlation ID in request is not a string")
.c_str());
}
*id = correlation_id.StringValue().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputCount(TRITONBACKEND_Request* request, uint32_t* count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*count = tr->ImmutableInputs().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** input_name)
{
*input_name = nullptr;
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
if (index >= inputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(inputs.size()) + " inputs")
.c_str());
}
// The request inputs are not allowed to change once the request
// makes it to the backend, so it is ok to just iterate through the
// map. Given that the inputs must be kept in a map and that the
// number of inputs is typically small, this linear search is
// preferable to having every request maintain the inputs as both a
// map and a vector.
uint32_t cnt = 0;
for (const auto& pr : inputs) {
if (cnt++ == index) {
InferenceRequest::Input* in = pr.second;
*input_name = in->Name().c_str();
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInput(
TRITONBACKEND_Request* request, const char* name,
TRITONBACKEND_Input** input)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
const auto& itr = inputs.find(name);
if (itr == inputs.end()) {
*input = nullptr;
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "unknown request input name " + name).c_str());
}
InferenceRequest::Input* in = itr->second;
*input = reinterpret_cast<TRITONBACKEND_Input*>(in);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestInputByIndex(
TRITONBACKEND_Request* request, const uint32_t index,
TRITONBACKEND_Input** input)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& inputs = tr->ImmutableInputs();
if (index >= inputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(inputs.size()) + " inputs")
.c_str());
}
// The request inputs are not allowed to change once the request
// makes it to the backend, so it is ok to just iterate through the
// map. Given that the inputs must be kept in a map and that the
// number of inputs is typically small, this linear search is
// preferable to having every request maintain the inputs as both a
// map and a vector.
uint32_t cnt = 0;
for (const auto& pr : inputs) {
if (cnt++ == index) {
InferenceRequest::Input* in = pr.second;
*input = reinterpret_cast<TRITONBACKEND_Input*>(in);
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputCount(
TRITONBACKEND_Request* request, uint32_t* count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
*count = tr->ImmutableRequestedOutputs().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputName(
TRITONBACKEND_Request* request, const uint32_t index,
const char** output_name)
{
*output_name = nullptr;
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
const auto& routputs = tr->ImmutableRequestedOutputs();
if (index >= routputs.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(tr->LogRequest() + "out of bounds index " + std::to_string(index) +
": request has " + std::to_string(routputs.size()) +
" requested outputs")
.c_str());
}
// The requested outputs are not allowed to change once the request
// makes it to the backend, so it is ok to just iterate through the
// set. Given that the requested outputs are kept in a set and that
// their number is typically small, this linear search should not be
// a performance issue.
uint32_t cnt = 0;
for (const auto& rout : routputs) {
if (cnt++ == index) {
*output_name = rout.c_str();
break;
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestOutputBufferProperties(
TRITONBACKEND_Request* request, const char* name, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
auto status =
tr->OutputBufferProperties(name, byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_RequestRelease(
TRITONBACKEND_Request* request, uint32_t release_flags)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::unique_ptr<InferenceRequest> ur(tr);
InferenceRequest::Release(std::move(ur), release_flags);
return nullptr; // success
}
///
/// TRITONBACKEND_State
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateUpdate(TRITONBACKEND_State* state)
{
SequenceState* ts = reinterpret_cast<SequenceState*>(state);
auto status = ts->Update();
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateNew(
TRITONBACKEND_State** state, TRITONBACKEND_Request* request,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
SequenceState* lstate;
std::vector<int64_t> lshape(shape, shape + dims_count);
auto& sequence_state = tr->GetSequenceStates();
if (sequence_state == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("unable to add state '") + name +
"'. State configuration is missing for model '" + tr->ModelName() +
"'.")
.c_str());
}
Status status = sequence_state->OutputState(
name, TritonToDataType(datatype), lshape, &lstate);
if (!status.IsOk()) {
*state = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*state = reinterpret_cast<TRITONBACKEND_State*>(lstate);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBuffer(
TRITONBACKEND_State* state, void** buffer, const uint64_t buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
SequenceState* to = reinterpret_cast<SequenceState*>(state);
Status status = Status::Success;
// If the buffer size exactly matches the buffer available, reuse the
// currently allocated buffer.
if (to->Data()->TotalByteSize() == buffer_byte_size) {
const std::shared_ptr<AllocatedMemory>& memory =
reinterpret_cast<const std::shared_ptr<AllocatedMemory>&>(to->Data());
TRITONSERVER_MemoryType current_memory_type;
int64_t current_memory_type_id;
void* lbuffer =
memory->MutableBuffer(&current_memory_type, &current_memory_type_id);
// Reuse the currently allocated buffer only if the requested memory type
// and memory type id match; otherwise allocate a new buffer with the
// requested memory type and memory type id.
if (current_memory_type == *memory_type &&
current_memory_type_id == *memory_type_id) {
*buffer = lbuffer;
} else {
std::shared_ptr<AllocatedMemory> memory =
std::make_shared<AllocatedMemory>(
buffer_byte_size, *memory_type, *memory_type_id);
*buffer = memory->MutableBuffer(memory_type, memory_type_id);
to->RemoveAllData();
status = to->SetData(memory);
}
} else {
std::shared_ptr<AllocatedMemory> memory = std::make_shared<AllocatedMemory>(
buffer_byte_size, *memory_type, *memory_type_id);
*buffer = memory->MutableBuffer(memory_type, memory_type_id);
to->RemoveAllData();
status = to->SetData(memory);
}
if (!status.IsOk()) {
*buffer = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_StateBufferAttributes(
TRITONBACKEND_State* state,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
SequenceState* to = reinterpret_cast<SequenceState*>(state);
to->Data()->BufferAt(
0, reinterpret_cast<BufferAttributes**>(buffer_attributes));
return nullptr; // success
}
//
// TRITONBACKEND_ResponseFactory
//
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryNew(
TRITONBACKEND_ResponseFactory** factory, TRITONBACKEND_Request* request)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::shared_ptr<InferenceResponseFactory>* response_factory =
new std::shared_ptr<InferenceResponseFactory>(tr->ResponseFactory());
*factory = reinterpret_cast<TRITONBACKEND_ResponseFactory*>(response_factory);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactoryDelete(TRITONBACKEND_ResponseFactory* factory)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
delete response_factory;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseFactorySendFlags(
TRITONBACKEND_ResponseFactory* factory, const uint32_t send_flags)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
Status status = (*response_factory)->SendFlags(send_flags);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
///
/// TRITONBACKEND_Response
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNew(
TRITONBACKEND_Response** response, TRITONBACKEND_Request* request)
{
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
std::unique_ptr<InferenceResponse> tresp;
Status status = tr->ResponseFactory()->CreateResponse(&tresp);
if (!status.IsOk()) {
*response = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*response = reinterpret_cast<TRITONBACKEND_Response*>(tresp.release());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseNewFromFactory(
TRITONBACKEND_Response** response, TRITONBACKEND_ResponseFactory* factory)
{
std::shared_ptr<InferenceResponseFactory>* response_factory =
reinterpret_cast<std::shared_ptr<InferenceResponseFactory>*>(factory);
std::unique_ptr<InferenceResponse> tr;
Status status = (*response_factory)->CreateResponse(&tr);
if (!status.IsOk()) {
*response = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*response = reinterpret_cast<TRITONBACKEND_Response*>(tr.release());
return nullptr; // success
}
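//
// Usage sketch (illustrative only, not part of this implementation): a
// decoupled backend can keep a response factory alive beyond the request and
// emit responses from it later, using only the factory functions defined
// here. Error handling is omitted for brevity.
//
//   TRITONBACKEND_ResponseFactory* factory;
//   TRITONBACKEND_ResponseFactoryNew(&factory, request);
//
//   // ...later, possibly from another thread...
//   TRITONBACKEND_Response* response;
//   TRITONBACKEND_ResponseNewFromFactory(&response, factory);
//   // ...add outputs to 'response' and send it...
//
//   // Signal that no more responses will be produced, then release the
//   // factory handle.
//   TRITONBACKEND_ResponseFactorySendFlags(
//       factory, TRITONSERVER_RESPONSE_COMPLETE_FINAL);
//   TRITONBACKEND_ResponseFactoryDelete(factory);
//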
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseDelete(TRITONBACKEND_Response* response)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
delete tr;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetStringParameter(
TRITONBACKEND_Response* response, const char* name, const char* value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetIntParameter(
TRITONBACKEND_Response* response, const char* name, const int64_t value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSetBoolParameter(
TRITONBACKEND_Response* response, const char* name, const bool value)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status = tr->AddParameter(name, value);
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseOutput(
TRITONBACKEND_Response* response, TRITONBACKEND_Output** output,
const char* name, const TRITONSERVER_DataType datatype,
const int64_t* shape, const uint32_t dims_count)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
std::vector<int64_t> lshape(shape, shape + dims_count);
InferenceResponse::Output* loutput;
Status status = tr->AddOutput(
name, TritonToDataType(datatype), std::move(lshape), &loutput);
if (!status.IsOk()) {
*output = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
*output = reinterpret_cast<TRITONBACKEND_Output*>(loutput);
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ResponseSend(
TRITONBACKEND_Response* response, const uint32_t send_flags,
TRITONSERVER_Error* error)
{
InferenceResponse* tr = reinterpret_cast<InferenceResponse*>(response);
Status status;
std::unique_ptr<InferenceResponse> utr(tr);
if (error == nullptr) {
status = InferenceResponse::Send(std::move(utr), send_flags);
} else {
status = InferenceResponse::SendWithStatus(
std::move(utr), send_flags,
Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(error)),
TRITONSERVER_ErrorMessage(error)));
}
if (!status.IsOk()) {
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
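//
// Usage sketch (illustrative only, not part of this implementation): the
// typical non-decoupled flow creates one response per request, attaches an
// output tensor, fills the output buffer and sends the response with the
// FINAL flag. The output name, shape and datatype are assumptions for
// illustration; error handling is omitted for brevity.
//
//   TRITONBACKEND_Response* response;
//   TRITONBACKEND_ResponseNew(&response, request);
//
//   const int64_t shape[2] = {1, 16};
//   TRITONBACKEND_Output* output;
//   TRITONBACKEND_ResponseOutput(
//       response, &output, "OUTPUT0", TRITONSERVER_TYPE_FP32, shape, 2);
//
//   void* buffer;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_OutputBuffer(
//       output, &buffer, 16 * sizeof(float), &memory_type, &memory_type_id);
//   // ...fill 'buffer', checking the actually returned 'memory_type'...
//
//   TRITONBACKEND_ResponseSend(
//       response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr /* success */);
//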
///
/// TRITONBACKEND_Input
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputProperties(
TRITONBACKEND_Input* input, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
if (name != nullptr) {
*name = ti->Name().c_str();
}
if (datatype != nullptr) {
*datatype = DataTypeToTriton(ti->DType());
}
if (shape != nullptr) {
*shape = ti->ShapeWithBatchDim().data();
}
if (dims_count != nullptr) {
*dims_count = ti->ShapeWithBatchDim().size();
}
if (byte_size != nullptr) {
*byte_size = ti->Data()->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCount();
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputPropertiesForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name, const char** name,
TRITONSERVER_DataType* datatype, const int64_t** shape,
uint32_t* dims_count, uint64_t* byte_size, uint32_t* buffer_count)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
if (name != nullptr) {
*name = ti->Name().c_str();
}
if (datatype != nullptr) {
*datatype = DataTypeToTriton(ti->DType());
}
if (shape != nullptr) {
*shape = ti->ShapeWithBatchDim().data();
}
if (dims_count != nullptr) {
*dims_count = ti->ShapeWithBatchDim().size();
}
if (host_policy_name != nullptr) {
if (byte_size != nullptr) {
*byte_size = ti->Data(host_policy_name)->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCountForHostPolicy(host_policy_name);
}
} else {
if (byte_size != nullptr) {
*byte_size = ti->Data()->TotalByteSize();
}
if (buffer_count != nullptr) {
*buffer_count = ti->DataBufferCount();
}
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBuffer(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
uint64_t* buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status = ti->DataBuffer(
index, buffer, buffer_byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_byte_size = 0;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
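//
// Usage sketch (illustrative only, not part of this implementation): given a
// TRITONBACKEND_Input* handle obtained from the request (e.g. via
// TRITONBACKEND_RequestInput, assumed here), a backend typically queries the
// input properties and then walks the underlying buffers, which may be split
// across several contiguous chunks.
//
//   const char* name;
//   TRITONSERVER_DataType datatype;
//   const int64_t* shape;
//   uint32_t dims_count;
//   uint64_t byte_size;
//   uint32_t buffer_count;
//   TRITONBACKEND_InputProperties(
//       input, &name, &datatype, &shape, &dims_count, &byte_size,
//       &buffer_count);
//
//   for (uint32_t b = 0; b < buffer_count; ++b) {
//     const void* buffer;
//     uint64_t buffer_byte_size;
//     TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//     int64_t memory_type_id = 0;
//     TRITONBACKEND_InputBuffer(
//         input, b, &buffer, &buffer_byte_size, &memory_type,
//         &memory_type_id);
//     // ...consume 'buffer_byte_size' bytes at 'buffer', honoring the
//     // returned 'memory_type' and 'memory_type_id'...
//   }
//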
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferAttributes(
TRITONBACKEND_Input* input, const uint32_t index, const void** buffer,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status = ti->DataBufferAttributes(
index, buffer, reinterpret_cast<BufferAttributes**>(buffer_attributes));
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_attributes = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_InputBufferForHostPolicy(
TRITONBACKEND_Input* input, const char* host_policy_name,
const uint32_t index, const void** buffer, uint64_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
InferenceRequest::Input* ti =
reinterpret_cast<InferenceRequest::Input*>(input);
Status status =
(host_policy_name == nullptr)
? ti->DataBuffer(
index, buffer, buffer_byte_size, memory_type, memory_type_id)
: ti->DataBufferForHostPolicy(
index, buffer, buffer_byte_size, memory_type, memory_type_id,
host_policy_name);
if (!status.IsOk()) {
*buffer = nullptr;
*buffer_byte_size = 0;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
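//
// Usage sketch (illustrative only, not part of this implementation): the
// host-policy variant behaves exactly like TRITONBACKEND_InputBuffer when the
// policy name is nullptr, so a backend can pass an optional policy name
// straight through.
//
//   const void* buffer;
//   uint64_t buffer_byte_size;
//   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
//   int64_t memory_type_id = 0;
//   TRITONBACKEND_InputBufferForHostPolicy(
//       input, host_policy_name /* may be nullptr */, 0 /* index */, &buffer,
//       &buffer_byte_size, &memory_type, &memory_type_id);
//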
///
/// TRITONBACKEND_Output
///
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBuffer(
TRITONBACKEND_Output* output, void** buffer,
const uint64_t buffer_byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id)
{
InferenceResponse::Output* to =
reinterpret_cast<InferenceResponse::Output*>(output);
Status status = to->AllocateDataBuffer(
buffer, buffer_byte_size, memory_type, memory_type_id);
if (!status.IsOk()) {
*buffer = nullptr;
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(status.StatusCode()), status.Message().c_str());
}
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_OutputBufferAttributes(
TRITONBACKEND_Output* output,
TRITONSERVER_BufferAttributes** buffer_attributes)
{
InferenceResponse::Output* to =
reinterpret_cast<InferenceResponse::Output*>(output);
*buffer_attributes = reinterpret_cast<TRITONSERVER_BufferAttributes*>(
to->GetBufferAttributes());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
TRITONBACKEND_BackendAttribute* backend_attributes,
const TRITONSERVER_InstanceGroupKind kind, const uint64_t count,
const uint64_t* device_ids, const uint64_t id_count)
{
auto ba = reinterpret_cast<TritonBackend::Attribute*>(backend_attributes);
ba->preferred_groups_.emplace_back();
auto& pg = ba->preferred_groups_.back();
switch (kind) {
case TRITONSERVER_INSTANCEGROUPKIND_AUTO:
pg.set_kind(inference::ModelInstanceGroup::KIND_AUTO);
break;
case TRITONSERVER_INSTANCEGROUPKIND_CPU:
pg.set_kind(inference::ModelInstanceGroup::KIND_CPU);
break;
case TRITONSERVER_INSTANCEGROUPKIND_GPU:
pg.set_kind(inference::ModelInstanceGroup::KIND_GPU);
break;
case TRITONSERVER_INSTANCEGROUPKIND_MODEL:
pg.set_kind(inference::ModelInstanceGroup::KIND_MODEL);
break;
}
pg.set_count(count);
if (device_ids != nullptr) {
for (size_t i = 0; i < id_count; ++i) {
pg.add_gpus(device_ids[i]);
}
}
  return nullptr;  // success
}
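//
// Usage sketch (illustrative only, not part of this implementation): a
// backend can advertise a preferred instance group from its
// TRITONBACKEND_GetBackendAttribute entry point. The entry-point signature
// below is an assumption based on tritonbackend.h.
//
//   TRITONSERVER_Error*
//   TRITONBACKEND_GetBackendAttribute(
//       TRITONBACKEND_Backend* backend,
//       TRITONBACKEND_BackendAttribute* backend_attributes)
//   {
//     // Prefer a single GPU instance on device 0.
//     const uint64_t device_ids[1] = {0};
//     return TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup(
//         backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_GPU,
//         1 /* count */, device_ids, 1 /* id_count */);
//   }
//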
} // extern C
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "backend_manager.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model.h"
#include "model_config.pb.h"
#include "status.h"
namespace triton { namespace core {
class InferenceServer;
class TritonModelInstance;
//
// Represents a model.
//
// Inheriting from Model to implement backend APIs
//
class TritonModel : public Model {
public:
static Status Create(
InferenceServer* server, const std::string& model_path,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const std::string& model_name, const int64_t version,
inference::ModelConfig model_config, const bool is_config_provided,
std::unique_ptr<TritonModel>* model);
~TritonModel();
const std::string& LocalizedModelPath() const
{
return localized_model_dir_->Path();
}
InferenceServer* Server() { return server_; }
bool AutoCompleteConfig() const { return auto_complete_config_; }
Status UpdateModelConfig(
const uint32_t config_version,
TRITONSERVER_Message* updated_config_message);
const std::shared_ptr<TritonBackend>& Backend() const { return backend_; }
const std::vector<std::unique_ptr<TritonModelInstance>>& Instances() const
{
return instances_;
}
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
Status AddInstance(
std::unique_ptr<TritonModelInstance>&& instance, const bool passive);
private:
DISALLOW_COPY_AND_ASSIGN(TritonModel);
TritonModel(
InferenceServer* server,
const std::shared_ptr<LocalizedPath>& localized_model_dir,
const std::shared_ptr<TritonBackend>& backend,
const double min_compute_capability, const int64_t version,
const inference::ModelConfig& config, const bool auto_complete_config);
// Set the scheduler based on the model configuration. The scheduler
// can only be set once for a backend.
Status SetConfiguredScheduler();
// Merges the global backend configs with the specific
// backend configs.
static Status ResolveBackendConfigs(
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const std::string& backend_name,
triton::common::BackendCmdlineConfig& config);
// Sets defaults for some backend configurations when none are specified on
// the command line.
static Status SetBackendConfigDefaults(
triton::common::BackendCmdlineConfig& config);
Status Initialize();
Status WarmUp();
// The server object that owns this model. The model holds this as a
// raw pointer because the lifetime of the server is guaranteed to
// be longer than the lifetime of a model owned by the server.
InferenceServer* server_;
// The minimum supported compute capability on device.
const double min_compute_capability_;
// Whether the backend should attempt to auto-complete the model config.
const bool auto_complete_config_;
// The localized repo directory holding the model. If localization
// required creation of a temporary local copy then that copy will
  // persist as long as this object is retained by this model.
std::shared_ptr<LocalizedPath> localized_model_dir_;
// Backend used by this model.
std::shared_ptr<TritonBackend> backend_;
// The model instances for this model.
std::vector<std::unique_ptr<TritonModelInstance>> instances_;
std::vector<std::unique_ptr<TritonModelInstance>> passive_instances_;
// Opaque state associated with this model.
void* state_;
};
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "backend_model_instance.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "backend_config.h"
#include "backend_model.h"
#include "cuda_utils.h"
#include "metrics.h"
#include "model_config.pb.h"
#include "numa_utils.h"
#include "server.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "triton/common/nvtx.h"
#include "tritonserver_apis.h"
// For an unknown reason, Windows will not export the TRITONBACKEND_*
// functions declared with dllexport in tritonbackend.h. To get those
// functions exported it is (also?) necessary to mark the definitions
// in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
namespace {
// Utilities for warmup feature
TRITONSERVER_Error*
WarmupResponseAlloc(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
*buffer = malloc(byte_size);
if (*buffer != nullptr) {
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = 0;
return nullptr;
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"failed to allocate output buffer for warmup.");
}
TRITONSERVER_Error*
WarmupResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
free(buffer);
return nullptr;
}
ResponseAllocator warmup_allocator = ResponseAllocator(
WarmupResponseAlloc, WarmupResponseRelease, nullptr /* start_fn */);
void
WarmupResponseComplete(
TRITONSERVER_InferenceResponse* iresponse, const uint32_t flags,
void* userp)
{
auto res_pair = reinterpret_cast<
std::pair<std::promise<void>, std::vector<std::string>*>*>(userp);
if (iresponse != nullptr) {
auto err = TRITONSERVER_InferenceResponseError(iresponse);
if (err != nullptr) {
// The error vector is shared by all requests in the batch for now
static std::mutex res_mtx;
{
std::lock_guard<std::mutex> lk(res_mtx);
res_pair->second->emplace_back(TRITONSERVER_ErrorMessage(err));
}
TRITONSERVER_ErrorDelete(err);
}
    // Just delete the response; warmup doesn't check for correctness.
LOG_TRITONSERVER_ERROR(
TRITONSERVER_InferenceResponseDelete(iresponse),
"deleting warmup response");
}
// Last response
if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
res_pair->first.set_value();
}
}
void
WarmupRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
if ((flags & TRITONSERVER_REQUEST_RELEASE_ALL) != 0) {
    // No need to release the request here; it is managed in WarmupData.
if (userp != nullptr) {
auto warmup_promise = reinterpret_cast<std::promise<void>*>(userp);
warmup_promise->set_value();
}
}
}
} // namespace
TritonModelInstance::TritonModelInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices)
: model_(model), name_(name), index_(index), kind_(kind),
device_id_(device_id), host_policy_(host_policy),
host_policy_message_(host_policy_message), profile_names_(profile_names),
passive_(passive), secondary_devices_(secondary_devices), state_(nullptr)
{
#ifdef TRITON_ENABLE_METRICS
if (Metrics::Enabled()) {
// Use an ID in the metric only for GPU instances. Otherwise use
// METRIC_REPORTER_ID_CPU to indicate no device should be reported in the
// metric.
const int id = (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU)
? device_id_
: METRIC_REPORTER_ID_CPU;
MetricModelReporter::Create(
model_->Name(), model_->Version(), id, model_->Config().metric_tags(),
&reporter_);
}
#endif // TRITON_ENABLE_METRICS
}
TritonModelInstance::~TritonModelInstance()
{
if (triton_backend_thread_.get() != nullptr) {
triton_backend_thread_->StopBackendThread();
}
// Model finalization is optional...
if (model_->Backend()->ModelInstanceFiniFn() != nullptr) {
LOG_TRITONSERVER_ERROR(
model_->Backend()->ModelInstanceFiniFn()(
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this)),
"failed finalizing model instance");
}
}
Status
TritonModelInstance::CreateInstances(
TritonModel* model,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const inference::ModelConfig& model_config, const bool device_blocking)
{
static triton::common::HostPolicyCmdlineConfig empty_host_policy;
  // This map is used to share a TritonBackendThread among instances on the
  // same device when the device-blocking execution policy is used.
std::map<uint32_t, std::shared_ptr<TritonBackendThread>> device_to_thread_map;
for (const auto& group : model_config.instance_group()) {
std::vector<std::string> profile_names;
for (const auto& profile_name : group.profile()) {
profile_names.push_back(profile_name);
}
std::vector<SecondaryDevice> secondary_devices;
for (const auto& secondary_device : group.secondary_devices()) {
secondary_devices.emplace_back(
inference::
ModelInstanceGroup_SecondaryDevice_SecondaryDeviceKind_Name(
secondary_device.kind()),
secondary_device.device_id());
}
for (int32_t c = 0; c < group.count(); ++c) {
std::string instance_name{group.count() > 1
? group.name() + "_" + std::to_string(c)
: group.name()};
const bool passive = group.passive();
std::vector<std::tuple<
std::string, TRITONSERVER_InstanceGroupKind, int32_t,
const inference::ModelRateLimiter*>>
instance_setting;
if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
instance_setting.emplace_back(
group.host_policy().empty() ? "cpu" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_CPU, 0 /* device_id */,
&group.rate_limiter());
} else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
for (const int32_t device_id : group.gpus()) {
instance_setting.emplace_back(
group.host_policy().empty() ? ("gpu_" + std::to_string(device_id))
: group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_GPU, device_id,
&group.rate_limiter());
}
} else if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
instance_setting.emplace_back(
group.host_policy().empty() ? "model" : group.host_policy(),
TRITONSERVER_INSTANCEGROUPKIND_MODEL, 0 /* device_id */,
&group.rate_limiter());
} else {
return Status(
Status::Code::INVALID_ARG,
std::string("instance_group kind ") +
ModelInstanceGroup_Kind_Name(group.kind()) + " not supported");
}
for (const auto is : instance_setting) {
const auto& kind = std::get<1>(is);
const auto& id = std::get<2>(is);
const std::string& policy_name = std::get<0>(is);
const triton::common::HostPolicyCmdlineConfig* host_policy;
const auto policy_it = host_policy_map.find(policy_name);
if (policy_it != host_policy_map.end()) {
host_policy = &policy_it->second;
} else {
host_policy = &empty_host_policy;
}
RETURN_IF_ERROR(SetNumaConfigOnThread(*host_policy));
auto err = CreateInstance(
model, instance_name, c, kind, id, profile_names, passive,
policy_name, *host_policy, *(std::get<3>(is)), device_blocking,
&device_to_thread_map, secondary_devices);
RETURN_IF_ERROR(ResetNumaMemoryPolicy());
RETURN_IF_ERROR(err);
        // When deploying on GPU, make sure the GPU memory usage is within the
        // allowed range; otherwise stop instance creation to ensure there is
        // sufficient GPU memory left for other use. The usage is checked
        // after loading the instance to better enforce the limit: if checked
        // before loading, we may create an instance that occupies the rest of
        // the available memory, which defeats the purpose of the check.
if (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
size_t free, total;
double memory_limit;
RETURN_IF_ERROR(GetDeviceMemoryInfo(id, &free, &total));
RETURN_IF_ERROR(BackendConfigurationModelLoadGpuFraction(
backend_cmdline_config_map, id, &memory_limit));
const size_t allow = total * memory_limit;
const size_t used = total - free;
if (used > allow) {
return Status(
Status::Code::UNAVAILABLE,
std::string("can not create model '") + instance_name +
"': memory limit set for " +
TRITONSERVER_InstanceGroupKindString(kind) + " " +
std::to_string(id) +
" has exceeded, model loading is rejected.");
}
}
}
}
}
return Status::Success;
}
Status
TritonModelInstance::CreateInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map,
const std::vector<SecondaryDevice>& secondary_devices)
{
// Create the JSON representation of the backend configuration.
triton::common::TritonJson::Value host_policy_json(
triton::common::TritonJson::ValueType::OBJECT);
triton::common::TritonJson::Value policy_setting_json(
host_policy_json, triton::common::TritonJson::ValueType::OBJECT);
for (const auto& pr : host_policy) {
RETURN_IF_ERROR(policy_setting_json.AddString(pr.first.c_str(), pr.second));
}
RETURN_IF_ERROR(host_policy_json.Add(
host_policy_name.c_str(), std::move(policy_setting_json)));
TritonServerMessage host_policy_message(host_policy_json);
std::unique_ptr<TritonModelInstance> local_instance(new TritonModelInstance(
model, name, index, kind, device_id, profile_names, passive, host_policy,
host_policy_message, secondary_devices));
TRITONBACKEND_ModelInstance* triton_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(local_instance.get());
  // Instance initialization is optional... We must set the shared
  // library path to point to the backend directory in case the
  // backend library attempts to load additional shared libraries.
if (model->Backend()->ModelInstanceInitFn() != nullptr) {
std::unique_ptr<SharedLibrary> slib;
RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
RETURN_IF_ERROR(slib->SetLibraryDirectory(model->Backend()->Directory()));
TRITONSERVER_Error* err =
model->Backend()->ModelInstanceInitFn()(triton_instance);
RETURN_IF_ERROR(slib->ResetLibraryDirectory());
RETURN_IF_TRITONSERVER_ERROR(err);
}
if (!passive) {
RETURN_IF_ERROR(local_instance->GenerateWarmupData());
RETURN_IF_ERROR(model->Server()->GetRateLimiter()->RegisterModelInstance(
local_instance.get(), rate_limiter_config));
RETURN_IF_ERROR(local_instance->SetBackendThread(
kind, device_id, device_blocking, device_to_thread_map));
}
RETURN_IF_ERROR(model->AddInstance(std::move(local_instance), passive));
return Status::Success;
}
Status
TritonModelInstance::SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map)
{
if (device_blocking && (kind == TRITONSERVER_INSTANCEGROUPKIND_GPU)) {
auto thread_it = device_to_thread_map->find(device_id);
if (thread_it != device_to_thread_map->end()) {
LOG_VERBOSE(1) << "Using already started backend thread for " << Name()
<< " on device " << device_id;
triton_backend_thread_ = thread_it->second;
}
}
if (triton_backend_thread_.get() == nullptr) {
std::unique_ptr<TritonBackendThread> local_backend_thread;
RETURN_IF_ERROR(TritonBackendThread::CreateBackendThread(
Name(), this, 0 /* nice */, device_id, &local_backend_thread));
triton_backend_thread_ = std::move(local_backend_thread);
device_to_thread_map->insert({device_id, triton_backend_thread_});
} else {
triton_backend_thread_->AddModelInstance(this);
}
RETURN_IF_ERROR(triton_backend_thread_->InitAndWarmUpModelInstance(this));
return Status::Success;
}
Status
TritonModelInstance::GenerateWarmupData()
{
warmup_samples_.clear();
for (const auto& warmup_setting : model_->Config().model_warmup()) {
if (warmup_setting.batch_size() == 0) {
LOG_VERBOSE(1) << "Skipping batch 0 warmup sample '"
<< warmup_setting.name() << "'";
continue;
}
LOG_VERBOSE(1) << "Generating warmup sample data for '"
<< warmup_setting.name() << "'";
    // Two passes: the first pass gets the max byte size for the synthetic
    // data; the second pass adds the original inputs and the override inputs
    // for control inputs.
int64_t max_zero_byte_size = 0;
int64_t max_random_byte_size = 0;
for (const auto& input_meta : warmup_setting.inputs()) {
auto element_count =
triton::common::GetElementCount(input_meta.second.dims());
if (element_count == -1) {
return Status(
Status::Code::INVALID_ARG,
"warmup setting expects all variable-size dimensions are specified "
"for input '" +
input_meta.first + "'");
}
int64_t batch_byte_size =
element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = element_count * sizeof(int32_t);
}
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
          // Triton expects the STRING type to be in a special format (4 bytes
          // prepended to specify the string length), so zero data is used for
          // simplicity (4 bytes * element count of zeros).
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size);
} else {
max_random_byte_size =
std::max(batch_byte_size, max_random_byte_size);
}
break;
}
default:
break;
}
}
warmup_samples_.emplace_back(warmup_setting.name(), warmup_setting.count());
auto& warmup_data = warmup_samples_.back();
// Create buffers for synthetic data
TRITONSERVER_MemoryType type;
int64_t type_id;
warmup_data.zero_data_.reset(new AllocatedMemory(
max_zero_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* zero_buffer = warmup_data.zero_data_->MutableBuffer(&type, &type_id);
memset(zero_buffer, 0, max_zero_byte_size);
warmup_data.random_data_.reset(new AllocatedMemory(
max_random_byte_size, TRITONSERVER_MEMORY_CPU_PINNED /* memory_type */,
0 /* memory_type_id */));
char* random_buffer =
warmup_data.random_data_->MutableBuffer(&type, &type_id);
for (int64_t offset = 0; offset < max_random_byte_size; offset++) {
random_buffer[offset] = rand();
}
    // Prepare the inference request for the specified sample. The in-process
    // C API is not used because the request doesn't go through the same
    // pipeline (i.e. no normalization / scheduler), so the request must be
    // prepared to the state it would be in just before the instance execute
    // function is called.
for (size_t cnt = 0; cnt < warmup_setting.batch_size(); cnt++) {
warmup_data.requests_.emplace_back(
new InferenceRequest(model_, model_->Version()));
auto& lrequest = warmup_data.requests_.back();
// Second pass to prepare original inputs.
std::vector<std::shared_ptr<InferenceRequest::Input>> input_sps;
for (const auto& input_meta : warmup_setting.inputs()) {
auto batch1_element_count =
triton::common::GetElementCount(input_meta.second.dims());
auto batch_byte_size =
batch1_element_count *
triton::common::GetDataTypeByteSize(input_meta.second.data_type());
if (batch_byte_size == 0) {
batch_byte_size = batch1_element_count * sizeof(int32_t);
}
const char* allocated_ptr;
switch (input_meta.second.input_data_type_case()) {
case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData:
allocated_ptr = zero_buffer;
break;
case inference::ModelWarmup_Input::InputDataTypeCase::kRandomData: {
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
allocated_ptr = zero_buffer;
} else {
allocated_ptr = random_buffer;
}
break;
}
case inference::ModelWarmup_Input::InputDataTypeCase::
kInputDataFile: {
// For data provided from file, we can set buffer in first pass
warmup_data.provided_data_.emplace_back(new std::string());
auto input_data = warmup_data.provided_data_.back().get();
RETURN_IF_ERROR(ReadTextFile(
JoinPath({model_->LocalizedModelPath(), kWarmupDataFolder,
input_meta.second.input_data_file()}),
input_data));
if (input_meta.second.data_type() ==
inference::DataType::TYPE_STRING) {
batch_byte_size = input_data->size();
} else if (((size_t)batch_byte_size) > input_data->size()) {
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects " +
std::to_string(batch_byte_size) +
" bytes, but the data "
"provided from " +
input_meta.second.input_data_file() + "only has " +
std::to_string(input_data->size()) + " bytes");
}
allocated_ptr = input_data->data();
break;
}
default:
return Status(
Status::Code::INVALID_ARG,
lrequest->LogRequest() + "warmup setting expects input '" +
input_meta.first + "' to have input_data_type set");
}
const inference::ModelInput* input_config;
bool is_original_input =
model_->GetInput(input_meta.first, &input_config).IsOk();
InferenceRequest::Input* input = nullptr;
std::vector<int64_t> input_meta_shape;
        // Append the batch dimension only if the model supports batching
        // and the input is not a control input.
if ((model_->Config().max_batch_size() != 0) && is_original_input) {
input_meta_shape.push_back(1);
}
for (auto d : input_meta.second.dims()) {
input_meta_shape.push_back(d);
}
if (is_original_input) {
RETURN_IF_ERROR(lrequest->AddOriginalInput(
input_meta.first, input_meta.second.data_type(), input_meta_shape,
&input));
} else {
input_sps.emplace_back();
RETURN_IF_ERROR(lrequest->AddOverrideInput(
input_meta.first, input_meta.second.data_type(),
(model_->Config().max_batch_size() != 0 ? 1 : 0),
input_meta_shape, &input_sps.back()));
input = input_sps.back().get();
}
RETURN_IF_ERROR(input->AppendData(
allocated_ptr, batch_byte_size,
TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */));
}
RETURN_IF_ERROR(lrequest->PrepareForInference());
// Override inputs must be added after PrepareForInference() is called
for (const auto& sp : input_sps) {
RETURN_IF_ERROR(lrequest->AddOverrideInput(sp));
}
}
}
return Status::Success;
}
void
TritonModelInstance::Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests,
const std::function<void()>& OnCompletion)
{
// Use a thread local vector to avoid needing to malloc each
// time an inference is run.
thread_local std::vector<TRITONBACKEND_Request*> triton_requests(1024);
triton_requests.clear();
for (auto& r : requests) {
// Load the input states for the inference request.
r->LoadInputStates();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(r.release()));
}
Execute(triton_requests);
OnCompletion();
}
Status
TritonModelInstance::Initialize()
{
RETURN_IF_ERROR(SetNumaConfigOnThread(HostPolicy()));
return Status::Success;
}
Status
TritonModelInstance::WarmUp()
{
// move samples to local variable for scoped cleanup
std::vector<triton::core::TritonModelInstance::WarmupData> lwarmup_samples;
lwarmup_samples.swap(warmup_samples_);
for (auto& sample : lwarmup_samples) {
for (size_t iteration = 1; iteration <= sample.count_; ++iteration) {
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name() << " is running warmup sample '"
<< sample.sample_name_ << "' for iteration " << iteration;
      // Request/response completion is asynchronous, so promises are used to
      // wait for completion. Error messages from the responses are also
      // collected in a vector.
std::vector<std::promise<void>> request_complete(sample.requests_.size());
std::vector<std::string> response_errors;
std::vector<std::pair<std::promise<void>, std::vector<std::string>*>>
response_complete(sample.requests_.size());
std::vector<TRITONBACKEND_Request*> triton_requests;
for (size_t i = 0; i < sample.requests_.size(); ++i) {
auto& request = sample.requests_[i];
request->SetReleaseCallback(
WarmupRequestComplete, &request_complete[i]);
response_complete[i].second = &response_errors;
request->SetResponseCallback(
&warmup_allocator, nullptr, WarmupResponseComplete,
&response_complete[i]);
// Capture timestamp before run to avoid incorrect accumulation from
// sequential warmup runs
#ifdef TRITON_ENABLE_STATS
request->CaptureRequestStartNs();
#endif // TRITON_ENABLE_STATS
request->CaptureQueueStartNs();
triton_requests.push_back(
reinterpret_cast<TRITONBACKEND_Request*>(request.get()));
}
Execute(triton_requests);
// Wait for warmup sample to complete and check error
for (size_t i = 0; i < sample.requests_.size(); ++i) {
request_complete[i].get_future().get();
response_complete[i].first.get_future().get();
}
if (response_errors.size() != 0) {
std::string err_str =
"failed to run warmup sample '" + sample.sample_name_ + "': ";
for (const auto& error : response_errors) {
err_str += (error + "; ");
}
        // End warmup as soon as there is a failing sample.
LOG_VERBOSE(1) << "model '" << sample.requests_.back()->ModelName()
<< "' instance " << Name()
<< " failed to run warmup sample '"
<< sample.sample_name_ << "'";
return Status(Status::Code::INVALID_ARG, err_str);
}
}
}
return Status::Success;
}
void
TritonModelInstance::Execute(
std::vector<TRITONBACKEND_Request*>& triton_requests)
{
TRITONBACKEND_ModelInstance* triton_model_instance =
reinterpret_cast<TRITONBACKEND_ModelInstance*>(this);
TritonBackend::TritonModelInstanceExecFn_t inst_exec_fn =
model_->Backend()->ModelInstanceExecFn();
// If there is an error then we retain ownership of 'requests'
// and must send error responses.
TRITONSERVER_Error* err = inst_exec_fn(
triton_model_instance, &triton_requests[0], triton_requests.size());
if (err != nullptr) {
Status status = Status(
TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
TRITONSERVER_ErrorMessage(err));
for (TRITONBACKEND_Request* tr : triton_requests) {
std::unique_ptr<InferenceRequest> ur(
reinterpret_cast<InferenceRequest*>(tr));
InferenceRequest::RespondIfError(ur, status, true /* release_requests */);
}
TRITONSERVER_ErrorDelete(err);
}
}
Status
TritonModelInstance::TritonBackendThread::CreateBackendThread(
const std::string name, TritonModelInstance* model_instance, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread)
{
TritonBackendThread* raw_triton_backend_thread =
new TritonBackendThread(name, model_instance->Model());
std::unique_ptr<TritonBackendThread> runner(raw_triton_backend_thread);
runner->AddModelInstance(model_instance);
runner->backend_thread_ =
std::thread([raw_triton_backend_thread, nice, device_id]() {
raw_triton_backend_thread->BackendThread(nice, device_id);
});
triton_backend_thread->reset(runner.release());
return Status::Success;
}
void
TritonModelInstance::TritonBackendThread::AddModelInstance(
TritonModelInstance* model_instance)
{
model_instances_.push_back(model_instance);
}
Status
TritonModelInstance::TritonBackendThread::InitAndWarmUpModelInstance(
TritonModelInstance* model_instance)
{
// Initialize the instance on the backend thread
auto init_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INIT, model_instance);
RETURN_IF_ERROR(
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, init_payload));
RETURN_IF_ERROR(init_payload->Wait());
// Warm-up the instance on the backend thread
auto warmup_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::WARM_UP, model_instance);
RETURN_IF_ERROR(model_->Server()->GetRateLimiter()->EnqueuePayload(
model_, warmup_payload));
RETURN_IF_ERROR(warmup_payload->Wait());
return Status::Success;
}
TritonModelInstance::TritonBackendThread::TritonBackendThread(
const std::string& name, TritonModel* model)
: name_(name), model_(model)
{
}
TritonModelInstance::TritonBackendThread::~TritonBackendThread()
{
StopBackendThread();
}
void
TritonModelInstance::TritonBackendThread::StopBackendThread()
{
if (backend_thread_.joinable()) {
// Signal the backend thread to exit and then wait for it...
auto exit_payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::EXIT, model_instances_.back());
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, exit_payload);
backend_thread_.join();
}
}
void
TritonModelInstance::TritonBackendThread::BackendThread(
const int nice, const int32_t device_id)
{
#ifndef _WIN32
if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
LOG_VERBOSE(1) << "Starting backend thread for " << name_ << " at nice "
<< nice << " on device " << device_id << "...";
} else {
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice (requested nice " << nice << " failed)"
<< " on device " << device_id << "...";
}
#else
LOG_VERBOSE(1) << "Starting backend thread for " << name_
<< " at default nice on device " << device_id << "...";
#endif
bool should_exit = false;
while (!should_exit) {
std::shared_ptr<Payload> payload;
model_->Server()->GetRateLimiter()->DequeuePayload(
model_instances_, &payload);
NVTX_RANGE(nvtx_, "BackendThread " + name_);
payload->Execute(&should_exit);
model_instances_.push_back(payload->GetInstance());
// Release the payload to the RateLimiter
model_->Server()->GetRateLimiter()->PayloadRelease(payload);
}
LOG_VERBOSE(1) << "Stopping backend thread for " << name_ << "...";
}
extern "C" {
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceName(
TRITONBACKEND_ModelInstance* instance, const char** name)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*name = ti->Name().c_str();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceKind(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_InstanceGroupKind* kind)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*kind = ti->Kind();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceDeviceId(
TRITONBACKEND_ModelInstance* instance, int32_t* device_id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*device_id = ti->DeviceId();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceHostPolicy(
TRITONBACKEND_ModelInstance* instance, TRITONSERVER_Message** host_policy)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*host_policy = const_cast<TRITONSERVER_Message*>(
reinterpret_cast<const TRITONSERVER_Message*>(&ti->HostPolicyMessage()));
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->Profiles().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceProfileName(
TRITONBACKEND_ModelInstance* instance, const uint32_t index,
const char** profile_name)
{
*profile_name = nullptr;
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rprofiles = ti->Profiles();
if (index >= rprofiles.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " + std::to_string(rprofiles.size()) +
" profiles")
.c_str());
}
*profile_name = rprofiles[index].c_str();
return nullptr; // success
}
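//
// Usage sketch (illustrative only, not part of this implementation): the
// count/name pair is typically used together to enumerate the optimization
// profiles configured for an instance.
//
//   uint32_t profile_count;
//   TRITONBACKEND_ModelInstanceProfileCount(instance, &profile_count);
//   for (uint32_t i = 0; i < profile_count; ++i) {
//     const char* profile_name;
//     TRITONBACKEND_ModelInstanceProfileName(instance, i, &profile_name);
//     // ...select or validate the profile named 'profile_name'...
//   }
//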
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceCount(
TRITONBACKEND_ModelInstance* instance, uint32_t* count)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*count = ti->SecondaryDevices().size();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSecondaryDeviceProperties(
TRITONBACKEND_ModelInstance* instance, uint32_t index, const char** kind,
int64_t* id)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
const auto& rsecondarydevices = ti->SecondaryDevices();
if (index >= rsecondarydevices.size()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("out of bounds index ") + std::to_string(index) +
": instance is configured with " +
std::to_string(rsecondarydevices.size()) + " secondary devices")
.c_str());
}
*kind = rsecondarydevices[index].kind_.c_str();
*id = rsecondarydevices[index].id_;
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceIsPassive(
TRITONBACKEND_ModelInstance* instance, bool* is_passive)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*is_passive = ti->IsPassive();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceModel(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Model** model)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*model = reinterpret_cast<TRITONBACKEND_Model*>(ti->Model());
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceState(
TRITONBACKEND_ModelInstance* instance, void** state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
*state = ti->State();
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceSetState(
TRITONBACKEND_ModelInstance* instance, void* state)
{
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->SetState(state);
return nullptr; // success
}
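//
// Usage sketch (illustrative only, not part of this implementation): backends
// commonly attach a per-instance object in their instance-initialize entry
// point and retrieve it again during execute and finalize. 'InstanceState'
// is a hypothetical backend-defined type.
//
//   // In TRITONBACKEND_ModelInstanceInitialize:
//   InstanceState* istate = new InstanceState();
//   TRITONBACKEND_ModelInstanceSetState(instance, istate);
//
//   // In TRITONBACKEND_ModelInstanceExecute (and deleted again in
//   // TRITONBACKEND_ModelInstanceFinalize):
//   void* vstate;
//   TRITONBACKEND_ModelInstanceState(instance, &vstate);
//   InstanceState* istate = reinterpret_cast<InstanceState*>(vstate);
//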
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportStatistics(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request* request,
const bool success, const uint64_t exec_start_ns,
const uint64_t compute_start_ns, const uint64_t compute_end_ns,
const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
InferenceRequest* tr = reinterpret_cast<InferenceRequest*>(request);
tr->ReportStatistics(
ti->MetricReporter(), success, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TRITONBACKEND_ModelInstance* instance, const uint64_t batch_size,
const uint64_t exec_start_ns, const uint64_t compute_start_ns,
const uint64_t compute_end_ns, const uint64_t exec_end_ns)
{
#ifdef TRITON_ENABLE_STATS
TritonModelInstance* ti = reinterpret_cast<TritonModelInstance*>(instance);
ti->Model()->MutableStatsAggregator()->UpdateInferBatchStats(
ti->MetricReporter(), batch_size, exec_start_ns, compute_start_ns,
compute_end_ns, exec_end_ns);
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
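//
// Usage sketch (illustrative only, not part of this implementation): a
// backend's execute function usually captures wall-clock timestamps around
// the compute phase, reports them per request, and then reports one
// batch-level record per execution. Capturing timestamps with std::chrono is
// an assumption; real backends may use a helper from the backend utilities
// instead.
//
//   auto now_ns = []() {
//     return static_cast<uint64_t>(
//         std::chrono::duration_cast<std::chrono::nanoseconds>(
//             std::chrono::steady_clock::now().time_since_epoch())
//             .count());
//   };
//   const uint64_t exec_start_ns = now_ns();
//   const uint64_t compute_start_ns = now_ns();
//   // ...run the model and produce responses...
//   const uint64_t compute_end_ns = now_ns();
//   const uint64_t exec_end_ns = now_ns();
//
//   TRITONBACKEND_ModelInstanceReportStatistics(
//       instance, request, true /* success */, exec_start_ns,
//       compute_start_ns, compute_end_ns, exec_end_ns);
//   TRITONBACKEND_ModelInstanceReportBatchStatistics(
//       instance, request_count /* as passed to execute */, exec_start_ns,
//       compute_start_ns, compute_end_ns, exec_end_ns);
//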
} // extern C
}} // namespace triton::core
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <deque>
#include <functional>
#include <future>
#include <map>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "constants.h"
#include "memory.h"
#include "metric_model_reporter.h"
#include "model_config.pb.h"
#include "server_message.h"
#include "status.h"
#include "triton/common/sync_queue.h"
namespace triton { namespace core {
class TritonModel;
class InferenceRequest;
//
// Represents a model instance.
//
class TritonModelInstance {
public:
static Status CreateInstances(
TritonModel* model,
const triton::common::BackendCmdlineConfigMap& backend_cmdline_config_map,
const triton::common::HostPolicyCmdlineConfigMap& host_policy_map,
const inference::ModelConfig& model_config, const bool device_blocking);
~TritonModelInstance();
const std::string& Name() const { return name_; }
size_t Index() const { return index_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
const triton::common::HostPolicyCmdlineConfig& HostPolicy() const
{
return host_policy_;
}
const TritonServerMessage& HostPolicyMessage() const
{
return host_policy_message_;
}
bool IsPassive() const { return passive_; }
const std::vector<std::string>& Profiles() const { return profile_names_; }
struct SecondaryDevice {
SecondaryDevice(const std::string kind, const int64_t id)
: kind_(kind), id_(id)
{
}
const std::string kind_;
const int64_t id_;
};
const std::vector<SecondaryDevice>& SecondaryDevices() const
{
return secondary_devices_;
}
Status Initialize();
Status WarmUp();
void Schedule(
std::vector<std::unique_ptr<InferenceRequest>>&& requests,
const std::function<void()>& OnCompletion);
TritonModel* Model() const { return model_; }
void* State() { return state_; }
void SetState(void* state) { state_ = state; }
MetricModelReporter* MetricReporter() const { return reporter_.get(); }
private:
DISALLOW_COPY_AND_ASSIGN(TritonModelInstance);
class TritonBackendThread;
TritonModelInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const TritonServerMessage& host_policy_message,
const std::vector<SecondaryDevice>& secondary_devices);
static Status CreateInstance(
TritonModel* model, const std::string& name, const size_t index,
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const std::vector<std::string>& profile_names, const bool passive,
const std::string& host_policy_name,
const triton::common::HostPolicyCmdlineConfig& host_policy,
const inference::ModelRateLimiter& rate_limiter_config,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map,
const std::vector<SecondaryDevice>& secondary_devices);
Status SetBackendThread(
const TRITONSERVER_InstanceGroupKind kind, const int32_t device_id,
const bool device_blocking,
std::map<uint32_t, std::shared_ptr<TritonBackendThread>>*
device_to_thread_map);
Status GenerateWarmupData();
void Execute(std::vector<TRITONBACKEND_Request*>& triton_requests);
class TritonBackendThread {
public:
static Status CreateBackendThread(
const std::string name, TritonModelInstance* model, const int nice,
const int32_t device_id,
std::unique_ptr<TritonBackendThread>* triton_backend_thread);
void AddModelInstance(TritonModelInstance* model_instance);
Status InitAndWarmUpModelInstance(TritonModelInstance* model_instance);
void StopBackendThread();
~TritonBackendThread();
private:
TritonBackendThread(const std::string& name, TritonModel* model);
void BackendThread(const int nice, const int32_t device_id);
std::string name_;
TritonModel* model_;
std::deque<TritonModelInstance*> model_instances_;
std::thread backend_thread_;
std::atomic<bool> backend_thread_exit_;
};
std::shared_ptr<TritonBackendThread> triton_backend_thread_;
struct WarmupData {
WarmupData(const std::string& sample_name, const size_t count)
: sample_name_(sample_name), count_(std::max(count, size_t{1}))
{
}
std::string sample_name_;
size_t count_;
    // A batch of requests is used to satisfy the batch size; this provides
    // better alignment with the batch expected by the model, especially for
    // sequence models.
std::vector<std::unique_ptr<InferenceRequest>> requests_;
// Placeholder for input data
std::unique_ptr<AllocatedMemory> zero_data_;
std::unique_ptr<AllocatedMemory> random_data_;
std::vector<std::unique_ptr<std::string>> provided_data_;
};
std::vector<WarmupData> warmup_samples_;
// The TritonModel object that owns this instance. The instance
// holds this as a raw pointer because the lifetime of the model is
// guaranteed to be longer than the lifetime of an instance owned by the
// model.
TritonModel* model_;
std::string name_;
size_t index_;
  // For CPU instances device_id_ is always 0. For GPU instances device_id_
  // indicates the GPU device to be used by the instance.
TRITONSERVER_InstanceGroupKind kind_;
int32_t device_id_;
const triton::common::HostPolicyCmdlineConfig host_policy_;
TritonServerMessage host_policy_message_;
std::vector<std::string> profile_names_;
bool passive_;
std::vector<SecondaryDevice> secondary_devices_;
// Reporter for metrics, or nullptr if no metrics should be reported
std::shared_ptr<MetricModelReporter> reporter_;
// Opaque state associated with this model instance.
void* state_;
};
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "buffer_attributes.h"
#include <cstring>
#include "constants.h"
namespace triton { namespace core {
void
BufferAttributes::SetByteSize(const size_t& byte_size)
{
byte_size_ = byte_size;
}
void
BufferAttributes::SetMemoryType(const TRITONSERVER_MemoryType& memory_type)
{
memory_type_ = memory_type;
}
void
BufferAttributes::SetMemoryTypeId(const int64_t& memory_type_id)
{
memory_type_id_ = memory_type_id;
}
void
BufferAttributes::SetCudaIpcHandle(void* cuda_ipc_handle)
{
char* lcuda_ipc_handle = reinterpret_cast<char*>(cuda_ipc_handle);
cuda_ipc_handle_.clear();
std::copy(
lcuda_ipc_handle, lcuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
std::back_inserter(cuda_ipc_handle_));
}
void*
BufferAttributes::CudaIpcHandle()
{
if (cuda_ipc_handle_.empty()) {
return nullptr;
} else {
return reinterpret_cast<void*>(cuda_ipc_handle_.data());
}
}
size_t
BufferAttributes::ByteSize() const
{
return byte_size_;
}
TRITONSERVER_MemoryType
BufferAttributes::MemoryType() const
{
return memory_type_;
}
int64_t
BufferAttributes::MemoryTypeId() const
{
return memory_type_id_;
}
BufferAttributes::BufferAttributes(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, char* cuda_ipc_handle)
: byte_size_(byte_size), memory_type_(memory_type),
memory_type_id_(memory_type_id)
{
// cuda ipc handle size
cuda_ipc_handle_.reserve(CUDA_IPC_STRUCT_SIZE);
if (cuda_ipc_handle != nullptr) {
std::copy(
cuda_ipc_handle, cuda_ipc_handle + CUDA_IPC_STRUCT_SIZE,
std::back_inserter(cuda_ipc_handle_));
}
}
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <iterator>
#include <vector>
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// A class to hold information about the buffer allocation.
//
class BufferAttributes {
public:
BufferAttributes(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, char cuda_ipc_handle[64]);
BufferAttributes()
{
memory_type_ = TRITONSERVER_MEMORY_CPU;
memory_type_id_ = 0;
cuda_ipc_handle_.reserve(64);
}
// Set the buffer byte size
void SetByteSize(const size_t& byte_size);
// Set the buffer memory_type
void SetMemoryType(const TRITONSERVER_MemoryType& memory_type);
// Set the buffer memory type id
void SetMemoryTypeId(const int64_t& memory_type_id);
// Set the cuda ipc handle
void SetCudaIpcHandle(void* cuda_ipc_handle);
// Get the cuda ipc handle
void* CudaIpcHandle();
// Get the byte size
size_t ByteSize() const;
// Get the memory type
TRITONSERVER_MemoryType MemoryType() const;
// Get the memory type id
int64_t MemoryTypeId() const;
private:
size_t byte_size_;
TRITONSERVER_MemoryType memory_type_;
int64_t memory_type_id_;
std::vector<char> cuda_ipc_handle_;
};
}} // namespace triton::core
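// Illustrative only, not part of the original sources: a minimal sketch of how
// the BufferAttributes class above might be populated and queried. It assumes
// this translation unit is built inside triton core so that
// "buffer_attributes.h" and the TRITONSERVER_* memory-type enum are available;
// the function name and the values used are hypothetical.
#include "buffer_attributes.h"

static void
ExampleBufferAttributesUsage()
{
  triton::core::BufferAttributes attrs;
  attrs.SetByteSize(1024);
  attrs.SetMemoryType(TRITONSERVER_MEMORY_CPU);
  attrs.SetMemoryTypeId(0);
  // CudaIpcHandle() returns nullptr until SetCudaIpcHandle() has copied a
  // CUDA_IPC_STRUCT_SIZE-byte handle into the attributes.
  void* ipc_handle = attrs.CudaIpcHandle();
  (void)ipc_handle;
  (void)attrs.ByteSize();
}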
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
namespace triton { namespace core {
constexpr char kInferHeaderContentLengthHTTPHeader[] =
"Inference-Header-Content-Length";
constexpr char kAcceptEncodingHTTPHeader[] = "Accept-Encoding";
constexpr char kContentEncodingHTTPHeader[] = "Content-Encoding";
constexpr char kContentTypeHeader[] = "Content-Type";
constexpr char kContentLengthHeader[] = "Content-Length";
constexpr char kTensorFlowGraphDefPlatform[] = "tensorflow_graphdef";
constexpr char kTensorFlowSavedModelPlatform[] = "tensorflow_savedmodel";
constexpr char kTensorFlowGraphDefFilename[] = "model.graphdef";
constexpr char kTensorFlowSavedModelFilename[] = "model.savedmodel";
constexpr char kTensorFlowBackend[] = "tensorflow";
constexpr char kTensorRTPlanPlatform[] = "tensorrt_plan";
constexpr char kTensorRTPlanFilename[] = "model.plan";
constexpr char kTensorRTBackend[] = "tensorrt";
constexpr char kOnnxRuntimeOnnxPlatform[] = "onnxruntime_onnx";
constexpr char kOnnxRuntimeOnnxFilename[] = "model.onnx";
constexpr char kOnnxRuntimeBackend[] = "onnxruntime";
constexpr char kOpenVINORuntimeOpenVINOFilename[] = "model.xml";
constexpr char kOpenVINORuntimeBackend[] = "openvino";
constexpr char kPyTorchLibTorchPlatform[] = "pytorch_libtorch";
constexpr char kPyTorchLibTorchFilename[] = "model.pt";
constexpr char kPyTorchBackend[] = "pytorch";
constexpr char kPythonFilename[] = "model.py";
constexpr char kPythonBackend[] = "python";
#ifdef TRITON_ENABLE_ENSEMBLE
constexpr char kEnsemblePlatform[] = "ensemble";
#endif // TRITON_ENABLE_ENSEMBLE
constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
"auto_mixed_precision";
constexpr char kModelConfigPbTxt[] = "config.pbtxt";
constexpr char kMetricsLabelModelName[] = "model";
constexpr char kMetricsLabelModelVersion[] = "version";
constexpr char kMetricsLabelGpuUuid[] = "gpu_uuid";
constexpr char kWarmupDataFolder[] = "warmup";
constexpr char kInitialStateFolder[] = "initial_state";
constexpr uint64_t NANOS_PER_SECOND = 1000000000;
constexpr uint64_t NANOS_PER_MILLIS = 1000000;
constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX;
constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000;
constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128;
constexpr size_t CUDA_IPC_STRUCT_SIZE = 64;
#ifdef TRITON_ENABLE_METRICS
// MetricModelReporter expects a device ID for GPUs, but we reuse this device
// ID for other metrics as well such as for CPU and Response Cache metrics
constexpr int METRIC_REPORTER_ID_CPU = -1;
constexpr int METRIC_REPORTER_ID_RESPONSE_CACHE = -2;
#endif
#define TIMESPEC_TO_NANOS(TS) \
((TS).tv_sec * triton::core::NANOS_PER_SECOND + (TS).tv_nsec)
#define TIMESPEC_TO_MILLIS(TS) \
(TIMESPEC_TO_NANOS(TS) / triton::core::NANOS_PER_MILLIS)
#define DISALLOW_MOVE(TypeName) TypeName(TypeName&&) = delete;
#define DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete;
#define DISALLOW_ASSIGN(TypeName) void operator=(const TypeName&) = delete;
#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
DISALLOW_COPY(TypeName) \
DISALLOW_ASSIGN(TypeName)
}} // namespace triton::core
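// Illustrative only, not part of the original sources: a minimal sketch showing
// how the TIMESPEC_TO_NANOS/TIMESPEC_TO_MILLIS macros above convert a POSIX
// timespec. For example, {tv_sec = 2, tv_nsec = 500000000} yields 2500000000 ns
// and 2500 ms. A POSIX struct timespec is assumed and the function name is
// hypothetical.
#include <time.h>

#include "constants.h"

static uint64_t
ExampleElapsedMillis(const struct timespec& start, const struct timespec& end)
{
  // TIMESPEC_TO_NANOS(ts) expands to ts.tv_sec * NANOS_PER_SECOND + ts.tv_nsec.
  return (TIMESPEC_TO_NANOS(end) - TIMESPEC_TO_NANOS(start)) /
         triton::core::NANOS_PER_MILLIS;
}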
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "cuda_memory_manager.h"
#include <cnmem.h>
#include <string.h>
#include <set>
#include "cuda_utils.h"
#include "triton/common/logging.h"
namespace {
#define RETURN_IF_CNMEM_ERROR(S, MSG) \
do { \
auto status__ = (S); \
if (status__ != CNMEM_STATUS_SUCCESS) { \
return Status( \
Status::Code::INTERNAL, \
(MSG) + ": " + cnmemGetErrorString(status__)); \
} \
} while (false)
std::string
PointerToString(void* ptr)
{
std::stringstream ss;
ss << ptr;
return ss.str();
}
} // namespace
namespace triton { namespace core {
std::unique_ptr<CudaMemoryManager> CudaMemoryManager::instance_;
std::mutex CudaMemoryManager::instance_mu_;
CudaMemoryManager::~CudaMemoryManager()
{
if (has_allocation_) {
auto status = cnmemFinalize();
if (status != CNMEM_STATUS_SUCCESS) {
LOG_ERROR << "Failed to finalize CUDA memory manager: [" << status << "] "
<< cnmemGetErrorString(status);
}
}
}
void
CudaMemoryManager::Reset()
{
std::lock_guard<std::mutex> lock(instance_mu_);
instance_.reset();
}
Status
CudaMemoryManager::Create(const CudaMemoryManager::Options& options)
{
// Ensure thread-safe creation of CUDA memory pool
std::lock_guard<std::mutex> lock(instance_mu_);
if (instance_ != nullptr) {
LOG_WARNING << "New CUDA memory pools could not be created since they "
"already exists";
return Status::Success;
}
std::set<int> supported_gpus;
auto status = GetSupportedGPUs(
&supported_gpus, options.min_supported_compute_capability_);
if (status.IsOk()) {
std::vector<cnmemDevice_t> devices;
for (auto gpu : supported_gpus) {
const auto it = options.memory_pool_byte_size_.find(gpu);
if ((it != options.memory_pool_byte_size_.end()) && (it->second != 0)) {
devices.emplace_back();
auto& device = devices.back();
memset(&device, 0, sizeof(device));
device.device = gpu;
device.size = it->second;
LOG_INFO << "CUDA memory pool is created on device " << device.device
<< " with size " << device.size;
}
}
if (!devices.empty()) {
RETURN_IF_CNMEM_ERROR(
cnmemInit(devices.size(), devices.data(), CNMEM_FLAGS_CANNOT_GROW),
std::string("Failed to finalize CUDA memory manager"));
} else {
LOG_INFO << "CUDA memory pool disabled";
}
// Keep the instance so that CNMeM is finalized properly when it goes out of scope
instance_.reset(new CudaMemoryManager(!devices.empty()));
} else {
return Status(
status.ErrorCode(),
"Failed to initialize CUDA memory manager: " + status.Message());
}
return Status::Success;
}
Status
CudaMemoryManager::Alloc(void** ptr, uint64_t size, int64_t device_id)
{
if (instance_ == nullptr) {
return Status(
Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
} else if (!instance_->has_allocation_) {
return Status(
Status::Code::UNAVAILABLE,
"CudaMemoryManager has no preallocated CUDA memory");
}
int current_device;
RETURN_IF_CUDA_ERR(
cudaGetDevice(&current_device), std::string("Failed to get device"));
bool overridden = (current_device != device_id);
if (overridden) {
RETURN_IF_CUDA_ERR(
cudaSetDevice(device_id), std::string("Failed to set device"));
}
// Defer returning error to make sure the device is recovered
auto err = cnmemMalloc(ptr, size, nullptr);
if (overridden) {
cudaSetDevice(current_device);
}
RETURN_IF_CNMEM_ERROR(
err, std::string("Failed to allocate CUDA memory with byte size ") +
std::to_string(size) + " on GPU " + std::to_string(device_id));
return Status::Success;
}
Status
CudaMemoryManager::Free(void* ptr, int64_t device_id)
{
if (instance_ == nullptr) {
return Status(
Status::Code::UNAVAILABLE, "CudaMemoryManager has not been created");
} else if (!instance_->has_allocation_) {
return Status(
Status::Code::UNAVAILABLE,
"CudaMemoryManager has no preallocated CUDA memory");
}
int current_device;
RETURN_IF_CUDA_ERR(
cudaGetDevice(&current_device), std::string("Failed to get device"));
bool overridden = (current_device != device_id);
if (overridden) {
RETURN_IF_CUDA_ERR(
cudaSetDevice(device_id), std::string("Failed to set device"));
}
// Defer returning error to make sure the device is recovered
auto err = cnmemFree(ptr, nullptr);
if (overridden) {
cudaSetDevice(current_device);
}
RETURN_IF_CNMEM_ERROR(
err, std::string("Failed to deallocate CUDA memory at address ") +
PointerToString(ptr) + " on GPU " + std::to_string(device_id));
return Status::Success;
}
}} // namespace triton::core
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <map>
#include <memory>
#include <mutex>
#include "status.h"
namespace triton { namespace core {
// This is a singleton class responsible for maintaining the CUDA memory pool
// used by the inference server. CUDA memory allocations and deallocations
// must be requested via the functions provided by this class.
class CudaMemoryManager {
public:
// Options to configure CUDA memory manager.
struct Options {
Options(double cc = 6.0, const std::map<int, uint64_t>& s = {})
: min_supported_compute_capability_(cc), memory_pool_byte_size_(s)
{
}
// The minimum compute capability of the supported devices.
double min_supported_compute_capability_;
// The size of CUDA memory reserved for the specified devices.
// The memory size will be rounded up to align with
// the default granularity (512 bytes).
// No memory will be reserved for devices that are not listed.
std::map<int, uint64_t> memory_pool_byte_size_;
};
~CudaMemoryManager();
// Create the memory manager based on 'options' specified.
// Return Status object indicating success or failure.
static Status Create(const Options& options);
// Allocate CUDA memory on GPU 'device_id' with
// the requested 'size' and return the pointer in 'ptr'.
// Return Status object indicating success or failure.
static Status Alloc(void** ptr, uint64_t size, int64_t device_id);
// Free the memory allocated by the memory manager on 'device_id'.
// Return Status object indicating success or failure.
static Status Free(void* ptr, int64_t device_id);
protected:
// Provide explicit control on the lifecycle of the CUDA memory manager,
// for testing only.
static void Reset();
private:
CudaMemoryManager(bool has_allocation) : has_allocation_(has_allocation) {}
bool has_allocation_;
static std::unique_ptr<CudaMemoryManager> instance_;
static std::mutex instance_mu_;
};
}} // namespace triton::core
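// Illustrative only, not part of the original sources: a minimal sketch of the
// CudaMemoryManager API declared above. It assumes a TRITON_ENABLE_GPU build
// with at least one GPU of sufficient compute capability; the pool size, the
// byte counts, and the function name are hypothetical.
#include "cuda_memory_manager.h"

static triton::core::Status
ExampleCudaPoolUsage()
{
  using triton::core::CudaMemoryManager;
  using triton::core::Status;
  // Reserve a 64 MB pool on GPU 0; devices that are not listed get no pool.
  CudaMemoryManager::Options options(
      6.0 /* min compute capability */, {{0, 64 * 1024 * 1024}});
  Status status = CudaMemoryManager::Create(options);
  if (!status.IsOk()) {
    return status;
  }
  void* ptr = nullptr;
  status = CudaMemoryManager::Alloc(&ptr, 1024 /* byte size */, 0 /* GPU 0 */);
  if (!status.IsOk()) {
    return status;
  }
  return CudaMemoryManager::Free(ptr, 0 /* GPU 0 */);
}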
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda_utils.h"
#include "model_config_utils.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {
#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
auto* copy_params = reinterpret_cast<CopyParams*>(args);
memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
delete copy_params;
}
#endif // TRITON_ENABLE_GPU
Status
GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total)
{
*free = 0;
*total = 0;
#ifdef TRITON_ENABLE_GPU
// Make sure that the correct device is set before querying the memory info
// and then restore the device to what was set by the caller.
int current_device;
auto cuerr = cudaGetDevice(&current_device);
bool overridden = false;
if (cuerr == cudaSuccess) {
overridden = (current_device != device_id);
if (overridden) {
cuerr = cudaSetDevice(device_id);
}
}
if (cuerr == cudaSuccess) {
cuerr = cudaMemGetInfo(free, total);
}
if (overridden) {
cudaSetDevice(current_device);
}
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
(std::string("unable to get memory info for device ") +
std::to_string(device_id) + ": " + cudaGetErrorString(cuerr)));
}
#endif // TRITON_ENABLE_GPU
return Status::Success;
}
Status
EnablePeerAccess(const double min_compute_capability)
{
#ifdef TRITON_ENABLE_GPU
// If we can't enable peer access for one device pair, the best we can
// do is to skip it...
std::set<int> supported_gpus;
bool all_enabled = false;
if (GetSupportedGPUs(&supported_gpus, min_compute_capability).IsOk()) {
all_enabled = true;
int can_access_peer = false;
for (const auto& host : supported_gpus) {
auto cuerr = cudaSetDevice(host);
if (cuerr == cudaSuccess) {
for (const auto& peer : supported_gpus) {
if (host == peer) {
continue;
}
cuerr = cudaDeviceCanAccessPeer(&can_access_peer, host, peer);
if ((cuerr == cudaSuccess) && (can_access_peer == 1)) {
cuerr = cudaDeviceEnablePeerAccess(peer, 0);
}
all_enabled &= ((cuerr == cudaSuccess) && (can_access_peer == 1));
}
}
}
}
if (!all_enabled) {
return Status(
Status::Code::UNSUPPORTED,
"failed to enable peer access for some device pairs");
}
#endif // TRITON_ENABLE_GPU
return Status::Success;
}
Status
CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used, bool copy_on_stream)
{
NVTX_RANGE(nvtx_, "CopyBuffer");
*cuda_used = false;
// For CUDA memcpy, all host-to-host copies are blocking with respect to the
// host, so use memcpy() directly. In this case, be careful about whether the
// src buffer is valid.
if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
(dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
if (copy_on_stream) {
auto params = new CopyParams(dst, src, byte_size);
cudaLaunchHostFunc(
cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
*cuda_used = true;
} else {
memcpy(dst, src, byte_size);
}
#else
memcpy(dst, src, byte_size);
#endif // TRITON_ENABLE_GPU
} else {
#ifdef TRITON_ENABLE_GPU
RETURN_IF_CUDA_ERR(
cudaMemcpyAsync(dst, src, byte_size, cudaMemcpyDefault, cuda_stream),
msg + ": failed to perform CUDA copy");
*cuda_used = true;
#else
return Status(
Status::Code::INTERNAL,
msg + ": try to use CUDA copy while GPU is not supported");
#endif // TRITON_ENABLE_GPU
}
return Status::Success;
}
void
CopyBufferHandler(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, void* response_ptr,
triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
completion_queue)
{
bool cuda_used = false;
Status status = CopyBuffer(
msg, src_memory_type, src_memory_type_id, dst_memory_type,
dst_memory_type_id, byte_size, src, dst, cuda_stream, &cuda_used);
completion_queue->Put(std::make_tuple(status, cuda_used, response_ptr));
}
#ifdef TRITON_ENABLE_GPU
Status
CheckGPUCompatibility(const int gpu_id, const double min_compute_capability)
{
// Query the compute capability from the device
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
"unable to get CUDA device properties for GPU ID" +
std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
}
double compute_capability = cuprops.major + (cuprops.minor / 10.0);
if ((compute_capability > min_compute_capability) ||
(abs(compute_capability - min_compute_capability) < 0.01)) {
return Status::Success;
} else {
return Status(
Status::Code::UNSUPPORTED,
"gpu " + std::to_string(gpu_id) + " has compute capability '" +
std::to_string(cuprops.major) + "." +
std::to_string(cuprops.minor) +
"' which is less than the minimum supported of '" +
std::to_string(min_compute_capability) + "'");
}
}
Status
GetSupportedGPUs(
std::set<int>* supported_gpus, const double min_compute_capability)
{
// Make sure set is empty before starting
supported_gpus->clear();
int device_cnt;
cudaError_t cuerr = cudaGetDeviceCount(&device_cnt);
if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {
device_cnt = 0;
} else if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL, "unable to get number of CUDA devices: " +
std::string(cudaGetErrorString(cuerr)));
}
// populates supported_gpus
for (int gpu_id = 0; gpu_id < device_cnt; gpu_id++) {
Status status = CheckGPUCompatibility(gpu_id, min_compute_capability);
if (status.IsOk()) {
supported_gpus->insert(gpu_id);
}
}
return Status::Success;
}
Status
SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support)
{
// Query the device to check if integrated
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, gpu_id);
if (cuerr != cudaSuccess) {
return Status(
Status::Code::INTERNAL,
"unable to get CUDA device properties for GPU ID" +
std::to_string(gpu_id) + ": " + cudaGetErrorString(cuerr));
}
// Zero-copy supported only on integrated GPU when it can map host memory
if (cuprops.integrated && cuprops.canMapHostMemory) {
*zero_copy_support = true;
} else {
*zero_copy_support = false;
}
return Status::Success;
}
#endif
}} // namespace triton::core
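// Illustrative only, not part of the original sources: a minimal sketch that
// uses the helpers defined above to enumerate the GPUs usable by Triton and to
// probe zero-copy support on each of them. It assumes a TRITON_ENABLE_GPU
// build; the function name and the minimum compute capability are hypothetical.
#ifdef TRITON_ENABLE_GPU
#include <set>

#include "cuda_utils.h"
#include "triton/common/logging.h"

static triton::core::Status
ExampleEnumerateGpus()
{
  std::set<int> gpus;
  triton::core::Status status =
      triton::core::GetSupportedGPUs(&gpus, 6.0 /* min compute capability */);
  if (!status.IsOk()) {
    return status;
  }
  for (const int gpu_id : gpus) {
    bool zero_copy = false;
    status = triton::core::SupportsIntegratedZeroCopy(gpu_id, &zero_copy);
    if (!status.IsOk()) {
      return status;
    }
    LOG_INFO << "GPU " << gpu_id
             << (zero_copy ? " supports" : " does not support")
             << " integrated zero-copy";
  }
  return triton::core::Status::Success;
}
#endif  // TRITON_ENABLE_GPU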
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <set>
#include "status.h"
#include "triton/common/sync_queue.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace core {
#ifdef TRITON_ENABLE_GPU
#define RETURN_IF_CUDA_ERR(X, MSG) \
do { \
cudaError_t err__ = (X); \
if (err__ != cudaSuccess) { \
return Status( \
Status::Code::INTERNAL, (MSG) + ": " + cudaGetErrorString(err__)); \
} \
} while (false)
#endif // TRITON_ENABLE_GPU
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
/// Get the memory info for the specified device.
/// \param device_id The device ID.
/// \param free Return free memory in bytes.
/// \param total Return total memory in bytes.
/// \return The error status. A non-OK status means failure to get memory info.
Status GetDeviceMemoryInfo(const int device_id, size_t* free, size_t* total);
/// Enable peer access for all GPU device pairs
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-OK status means not all pairs are enabled
Status EnablePeerAccess(const double min_compute_capability);
/// Copy buffer from 'src' to 'dst' for given 'byte_size'. The buffer location
/// is identified by the memory type and id, and the corresponding copy will be
/// initiated.
/// \param msg The message to be prepended in error message.
/// \param src_memory_type The memory type CPU/GPU of the source.
/// \param src_memory_type_id The device id of the source.
/// \param dst_memory_type The memory type CPU/GPU of the destination.
/// \param dst_memory_type_id The device id of the destination.
/// \param byte_size The size in bytes to be copied from source to destination.
/// \param src The buffer start address of the source.
/// \param dst The buffer start address of the destination.
/// \param cuda_stream The stream to associate the copy with; 0 can be
/// passed for the default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return The error status. A non-ok status indicates failure to copy the
/// buffer.
Status CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used,
bool copy_on_stream = false);
#ifdef TRITON_ENABLE_GPU
/// Validates the compute capability of the indexed GPU.
/// \param gpu_id The index of the target GPU.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-OK status means the target GPU is
/// not supported.
Status CheckGPUCompatibility(
const int gpu_id, const double min_compute_capability);
/// Obtains the set of GPU ids that are supported by Triton.
/// \param supported_gpus Returns the set of integers which is
/// populated with the ids of the supported GPUs.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-ok status means there were
/// errors encountered while querying GPU devices.
Status GetSupportedGPUs(
std::set<int>* supported_gpus, const double min_compute_capability);
/// Checks if the GPU specified is an integrated GPU and supports Zero-copy.
/// \param gpu_id The index of the target GPU.
/// \param zero_copy_support If true, Zero-copy is supported by this GPU.
/// \return The error status. A non-OK status means the target GPU is
/// not supported.
Status SupportsIntegratedZeroCopy(const int gpu_id, bool* zero_copy_support);
#endif
// Helper around CopyBuffer that updates the completion queue with the returned
// status and cuda_used flag.
void CopyBufferHandler(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, void* response_ptr,
triton::common::SyncQueue<std::tuple<Status, bool, void*>>*
completion_queue);
struct CopyParams {
CopyParams(void* dst, const void* src, const size_t byte_size)
: dst_(dst), src_(src), byte_size_(byte_size)
{
}
void* dst_;
const void* src_;
const size_t byte_size_;
};
}} // namespace triton::core
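// Illustrative only, not part of the original sources: a minimal sketch of the
// CopyBuffer() helper declared above for a host-to-host copy. With both memory
// types set to TRITONSERVER_MEMORY_CPU and 'copy_on_stream' left at its
// default, the copy falls back to memcpy(), so 'cuda_used' stays false and no
// stream synchronization is needed. The buffer contents, sizes, and function
// name are hypothetical.
#include <vector>

#include "cuda_utils.h"

static triton::core::Status
ExampleHostToHostCopy()
{
  std::vector<char> src(256, 'a');
  std::vector<char> dst(256, 0);
  bool cuda_used = false;
  return triton::core::CopyBuffer(
      "example copy", TRITONSERVER_MEMORY_CPU, 0 /* src_memory_type_id */,
      TRITONSERVER_MEMORY_CPU, 0 /* dst_memory_type_id */, src.size(),
      src.data(), dst.data(), nullptr /* cuda_stream */, &cuda_used);
}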
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dynamic_batch_scheduler.h"
#ifndef _WIN32
#include <sys/resource.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include "constants.h"
#include "server.h"
#include "triton/common/logging.h"
#include "triton/common/model_config.h"
#include "triton/common/nvtx.h"
namespace triton { namespace core {
bool
IsStaleState(Payload::State payload_state)
{
return (
(payload_state == Payload::State::EXECUTING) ||
(payload_state == Payload::State::RELEASED));
}
DynamicBatchScheduler::DynamicBatchScheduler(
TritonModel* model, TritonModelInstance* model_instance,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
const inference::ModelQueuePolicy& default_queue_policy,
const uint32_t priority_levels, const ModelQueuePolicyMap& queue_policy_map)
: model_(model), model_instance_(model_instance),
model_name_(model->Name()),
dynamic_batching_enabled_(dynamic_batching_enabled),
queue_(default_queue_policy, priority_levels, queue_policy_map),
stop_(false), max_batch_size_((size_t)std::max(1, max_batch_size)),
preferred_batch_sizes_(preferred_batch_sizes),
pending_batch_delay_ns_(max_queue_delay_microseconds * 1000),
pending_batch_size_(0), queued_batch_size_(0),
next_preferred_batch_size_(0),
enforce_equal_shape_tensors_(enforce_equal_shape_tensors),
has_optional_input_(false), preserve_ordering_(preserve_ordering)
{
rate_limiter_ = model_->Server()->GetRateLimiter();
// Both the server and the model config must enable caching
// for the model to utilize the response cache.
response_cache_enabled_ =
(model_->Server()->ResponseCacheEnabled() && response_cache_enable);
#ifdef TRITON_ENABLE_METRICS
// Initialize metric reporter for cache statistics if cache enabled
if (response_cache_enabled_) {
MetricModelReporter::Create(
model_name_, model_->Version(), METRIC_REPORTER_ID_RESPONSE_CACHE,
model_->Config().metric_tags(), &reporter_);
}
#endif // TRITON_ENABLE_METRICS
max_preferred_batch_size_ = 0;
for (const auto size : preferred_batch_sizes_) {
max_preferred_batch_size_ =
std::max(max_preferred_batch_size_, (size_t)size);
}
for (const auto& input : model_->Config().input()) {
if (input.optional()) {
has_optional_input_ = true;
break;
}
}
}
Status
DynamicBatchScheduler::Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
std::unique_ptr<Scheduler>* scheduler)
{
inference::ModelDynamicBatching batcher_config;
batcher_config.set_preserve_ordering(preserve_ordering);
for (const auto& bs : preferred_batch_sizes) {
batcher_config.add_preferred_batch_size(bs);
}
batcher_config.set_max_queue_delay_microseconds(max_queue_delay_microseconds);
return Create(
model, model_instance, nice, dynamic_batching_enabled, max_batch_size,
enforce_equal_shape_tensors, batcher_config, response_cache_enable,
scheduler);
}
Status
DynamicBatchScheduler::Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const inference::ModelDynamicBatching& batcher_config,
const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler)
{
std::set<int32_t> preferred_batch_sizes;
for (const auto size : batcher_config.preferred_batch_size()) {
preferred_batch_sizes.insert(size);
}
DynamicBatchScheduler* dyna_sched = new DynamicBatchScheduler(
model, model_instance, dynamic_batching_enabled, max_batch_size,
enforce_equal_shape_tensors, batcher_config.preserve_ordering(),
response_cache_enable, preferred_batch_sizes,
batcher_config.max_queue_delay_microseconds(),
batcher_config.default_queue_policy(), batcher_config.priority_levels(),
batcher_config.priority_queue_policy());
std::unique_ptr<DynamicBatchScheduler> sched(dyna_sched);
sched->scheduler_thread_exit_.store(false);
if (dynamic_batching_enabled) {
sched->NewPayload();
sched->scheduler_thread_ =
std::thread([dyna_sched, nice]() { dyna_sched->BatcherThread(nice); });
}
scheduler->reset(sched.release());
return Status::Success;
}
DynamicBatchScheduler::~DynamicBatchScheduler()
{
// Signal the scheduler thread to exit and then wait for it.
scheduler_thread_exit_.store(true);
cv_.notify_one();
if (scheduler_thread_.joinable()) {
scheduler_thread_.join();
}
}
Status
DynamicBatchScheduler::Enqueue(std::unique_ptr<InferenceRequest>& request)
{
if (stop_) {
return Status(
Status::Code::UNAVAILABLE,
request->LogRequest() +
"Server is stopping, scheduler for model has stopped accepting new "
"inference requests");
}
// If the queue start timestamp hasn't been set, the queue timer starts at
// the beginning of the queueing and scheduling process. Otherwise, the
// dynamic batcher is being used as a component of another batcher and should
// not overwrite the queue start timestamp.
if (request->QueueStartNs() == 0) {
request->CaptureQueueStartNs();
INFER_TRACE_ACTIVITY(
request->Trace(), TRITONSERVER_TRACE_QUEUE_START,
request->QueueStartNs());
#ifdef TRITON_ENABLE_TRACING
request->TraceInputTensors(
TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT, "DynamicBatchScheduler Enqueue");
#endif // TRITON_ENABLE_TRACING
}
// Record the time at the beginning of batcher queueing. In the case of the
// oldest sequence batcher, this will overwrite the value that was previously
// set by the sequence batcher, which is okay because by this point the
// previous batcher no longer needs the value and it can be safely reused by
// the dynamic batcher.
request->CaptureBatcherStartNs();
std::unique_ptr<InferenceResponse> cached_response;
if (response_cache_enabled_) {
CacheLookUp(request, cached_response);
}
if (cached_response != nullptr) {
// If there was a cache hit then try sending the cached response
// and release the request.
if (preserve_ordering_) {
// In order to preserve the order, the response send must be
// delegated.
DelegateResponse(request);
}
// Send cached response and release request
InferenceResponse::Send(
std::move(cached_response), TRITONSERVER_RESPONSE_COMPLETE_FINAL);
InferenceRequest::Release(
std::move(request), TRITONSERVER_REQUEST_RELEASE_ALL);
return Status::Success;
}
if (!dynamic_batching_enabled_) {
if (preserve_ordering_ || response_cache_enabled_) {
DelegateResponse(request);
}
// If not using dynamic batching, directly enqueue the
// request to model for execution
auto payload = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INFER_RUN, nullptr /* TritonModelInstance*/);
payload->AddRequest(std::move(request));
RETURN_IF_ERROR(
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, payload));
} else {
bool wake_batcher = true;
{
std::lock_guard<std::mutex> lock(mu_);
queued_batch_size_ += std::max(1U, request->BatchSize());
// Assuming no error is returned, this call takes ownership of
// 'request' and so we can't use it after this point.
RETURN_IF_ERROR(queue_.Enqueue(request->Priority(), request));
// If there are any idle runners and the queued batch size is greater than or
// equal to the next preferred batch size, then wake the batcher up to service
// this request. We do the actual wake outside of the lock to avoid
// having the woken thread immediately block on the lock.
wake_batcher =
model_->Server()->GetRateLimiter()->PayloadSlotAvailable(model_);
// We may wake up the runner less often if we don't enforce equal shapes
// within a batch; otherwise we must always wake up the runner to check.
if (enforce_equal_shape_tensors_.empty()) {
std::lock_guard<std::mutex> exec_lock(*(curr_payload_->GetExecMutex()));
auto payload_state = curr_payload_->GetState();
wake_batcher &=
(payload_saturated_ || IsStaleState(payload_state) ||
(queued_batch_size_ >= next_preferred_batch_size_));
}
}
if (wake_batcher) {
cv_.notify_one();
}
}
return Status::Success;
}
void
DynamicBatchScheduler::NewPayload()
{
curr_payload_ = model_->Server()->GetRateLimiter()->GetPayload(
Payload::Operation::INFER_RUN, model_instance_);
payload_saturated_ = false;
}
void
DynamicBatchScheduler::BatcherThread(const int nice)
{
#ifndef _WIN32
if (setpriority(PRIO_PROCESS, syscall(SYS_gettid), nice) == 0) {
LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
<< " at nice " << nice << "...";
} else {
LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
<< " at default nice (requested nice " << nice
<< " failed)...";
}
#else
LOG_VERBOSE(1) << "Starting dynamic-batcher thread for " << model_name_
<< " at default nice...";
#endif
// For debugging/testing, delay start of threads until the queue
// contains the specified number of entries.
size_t delay_cnt = 0;
{
const char* dstr = getenv("TRITONSERVER_DELAY_SCHEDULER");
if (dstr != nullptr) {
delay_cnt = atoi(dstr);
LOG_VERBOSE(1) << "Delaying batcher thread for " << model_name_
<< " until " << delay_cnt << " queued requests...";
}
}
auto wait_for_slots = [this]() {
return model_->Server()->GetRateLimiter()->PayloadSlotAvailable(model_);
};
const uint64_t default_wait_microseconds = 500 * 1000;
while (!scheduler_thread_exit_.load()) {
NVTX_RANGE(nvtx_, "DynamicBatcher " + model_name_);
std::shared_ptr<std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>
rejected_requests;
uint64_t wait_microseconds = 0;
// Hold the lock for as short a time as possible.
{
std::unique_lock<std::mutex> lock(mu_);
{
std::lock_guard<std::mutex> exec_lock(*(curr_payload_->GetExecMutex()));
auto payload_state = curr_payload_->GetState();
if (payload_saturated_ || IsStaleState(payload_state)) {
NewPayload();
next_preferred_batch_size_ = 0;
}
}
if (delay_cnt > 0) {
// Debugging/testing... wait until queue contains 'delay_cnt'
// items...
wait_microseconds = 10 * 1000;
if (queue_.Size() >= delay_cnt) {
delay_cnt = 0;
}
LOG_VERBOSE(1) << "Delaying batcher thread " << model_name_ << " until "
<< delay_cnt
<< " queued requests, current total = " << queue_.Size();
} else if (queue_.Empty()) {
wait_microseconds = default_wait_microseconds;
} else {
if (payload_saturated_) {
continue;
}
cv_.wait(lock, wait_for_slots);
{
std::lock_guard<std::mutex> exec_lock(
*(curr_payload_->GetExecMutex()));
auto payload_state = curr_payload_->GetState();
if (IsStaleState(payload_state)) {
continue;
}
// Use dynamic batching to get request(s) to execute.
wait_microseconds = GetDynamicBatch();
// Get requests that are rejected from searching dynamic batch.
queue_.ReleaseRejectedRequests(&rejected_requests);
// Extract batch only if there is pending batch
auto pending_batch_queue_cnt = queue_.PendingBatchCount();
if ((wait_microseconds == 0) && (pending_batch_queue_cnt != 0)) {
curr_payload_->ReserveRequests(pending_batch_queue_cnt);
for (size_t idx = 0; idx < pending_batch_queue_cnt; ++idx) {
std::unique_ptr<InferenceRequest> request;
auto status = queue_.Dequeue(&request);
if (status.IsOk()) {
if (preserve_ordering_ || response_cache_enabled_) {
DelegateResponse(request);
}
curr_payload_->AddRequest(std::move(request));
} else {
// The queue is empty, which conflicts with the pending batch
// count. Send the current batch, if any, and reset the related
// variables.
LOG_ERROR << request->LogRequest()
<< "Failed to retrieve request from scheduler queue: "
<< status.Message();
queue_.ResetCursor();
queued_batch_size_ = 0;
pending_batch_size_ = 0;
break;
}
}
if (curr_payload_->GetState() == Payload::State::UNINITIALIZED) {
curr_payload_->SetState(Payload::State::READY);
}
queued_batch_size_ -= pending_batch_size_;
pending_batch_size_ = 0;
}
}
}
// If no requests are to be handled, wait for notification or
// for the specified timeout before checking the queue again.
if (wait_microseconds > 0) {
std::chrono::microseconds wait_timeout(wait_microseconds);
cv_.wait_for(lock, wait_timeout);
}
}
if (curr_payload_->GetState() == Payload::State::READY) {
auto callback = [this]() { cv_.notify_one(); };
curr_payload_->SetCallback(callback);
model_->Server()->GetRateLimiter()->EnqueuePayload(model_, curr_payload_);
}
// Finish rejected requests if any
if (rejected_requests != nullptr) {
static Status rejected_status =
Status(Status::Code::UNAVAILABLE, "Request timeout expired");
for (auto& rejected_queue : *rejected_requests) {
for (auto& rejected_request : rejected_queue) {
InferenceRequest::RespondIfError(
rejected_request, rejected_status, true);
}
}
}
} // end runner loop
LOG_VERBOSE(1) << "Stopping dynamic-batcher thread for " << model_name_
<< "...";
}
uint64_t
DynamicBatchScheduler::GetDynamicBatch()
{
// 'mu_' mutex must be held when this function is called. queue_
// must not be empty.
// Examine the new requests. If adding these new requests to the
// pending batch allows a preferred batch size then execute it
// immediately. Stop examining requests if the maximum preferred
// batch size would be exceeded or if the shape of the next request
// does not match the shape of the pending batch.
bool send_now = false;
if (!queue_.IsCursorValid()) {
queue_.ResetCursor();
pending_batch_size_ = 0;
}
size_t best_preferred_batch_size = 0;
queued_batch_size_ -= queue_.ApplyPolicyAtCursor();
// When there is optional input or input shape must be enforced,
// the inputs in the requests must be examined for forming a batch
const bool check_input =
!enforce_equal_shape_tensors_.empty() || has_optional_input_;
auto payload_batch_size = curr_payload_->BatchSize();
while (!queue_.CursorEnd()) {
const auto batch_size = std::max(1U, queue_.RequestAtCursor()->BatchSize());
// If there is no pending batch, then this request is starting a
// new batch.
if ((payload_batch_size + queue_.PendingBatchCount()) == 0) {
// Get the shape of the new batch that is being started...
if (check_input) {
if (!curr_payload_->MutableRequiredEqualInputs()
->Initialize(
queue_.RequestAtCursor(), enforce_equal_shape_tensors_,
has_optional_input_)
.IsOk()) {
send_now = true;
break;
}
}
} else {
// There is a pending batch and adding this request would make
// the batch size larger than all of the preferred batch sizes,
// so mark the cursor at this point. Don't send the pending batch yet,
// so that we can examine the queue delay of the requests that fit in a batch.
if (((payload_batch_size + pending_batch_size_ + batch_size) >
max_preferred_batch_size_) &&
(best_preferred_batch_size == 0)) {
best_preferred_batch_size = pending_batch_size_;
queue_.MarkCursor();
payload_saturated_ = true;
}
if ((payload_batch_size + pending_batch_size_ + batch_size) >
max_batch_size_) {
send_now = true;
break;
}
// There is a pending batch and it has a different shape than
// this request, so send the pending batch as it is.
if (check_input &&
!curr_payload_->MutableRequiredEqualInputs()->HasEqualInputs(
queue_.RequestAtCursor())) {
curr_payload_->MarkSaturated();
send_now = true;
break;
}
}
pending_batch_size_ += batch_size;
queue_.AdvanceCursor();
queued_batch_size_ -= queue_.ApplyPolicyAtCursor();
if (preferred_batch_sizes_.find(pending_batch_size_ + payload_batch_size) !=
preferred_batch_sizes_.end()) {
best_preferred_batch_size = pending_batch_size_;
queue_.MarkCursor();
}
}
// Obtain the age of the oldest pending request to compare with the maximum
// batch queueing delay.
uint64_t now_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
uint64_t delay_ns = now_ns - queue_.OldestEnqueueTime();
bool delay_is_exceeded =
(pending_batch_delay_ns_ != 0) && (delay_ns >= pending_batch_delay_ns_);
// If we found a preferred batch size and the queue delay hasn't been
// exceeded, then execute that.
if ((best_preferred_batch_size != 0) && !delay_is_exceeded) {
if (pending_batch_delay_ns_ == 0) {
payload_saturated_ = true;
}
pending_batch_size_ = best_preferred_batch_size;
queue_.SetCursorToMark();
return 0;
}
// There is no request in the pending batch when all queued requests have
// exceeded their timeout and the queue policies are REJECT.
if (queue_.PendingBatchCount() == 0) {
return 0;
}
// If the delay has been exceeded, or if the current batch can't grow
// any larger, then just immediately execute whatever is pending.
if (send_now || ((payload_batch_size + pending_batch_size_) >=
max_preferred_batch_size_)) {
payload_saturated_ = true;
return 0;
}
if (delay_is_exceeded || (pending_batch_delay_ns_ == 0)) {
return 0;
}
// Set the next preferred batch size given the pending batch size
auto next_preferred_batch_size_it = preferred_batch_sizes_.upper_bound(
pending_batch_size_ + payload_batch_size);
if (next_preferred_batch_size_it != preferred_batch_sizes_.end()) {
next_preferred_batch_size_ = *next_preferred_batch_size_it;
} else {
next_preferred_batch_size_ =
preferred_batch_sizes_.empty() ? 0 : *preferred_batch_sizes_.begin();
}
if (next_preferred_batch_size_ != 0) {
next_preferred_batch_size_ -= payload_batch_size;
}
// At this point we have not found a pending batch that should be executed
// immediately. However, if we have scheduled a payload that can still grow
// and has not yet reached a preferred batch size, we should move the pending
// batch over to it so that the model instance picks up the largest available
// batch even if it is not a preferred batch.
if (!payload_saturated_ && (payload_batch_size != 0) &&
(preferred_batch_sizes_.find(payload_batch_size) ==
preferred_batch_sizes_.end())) {
return 0;
}
uint64_t wait_ns = pending_batch_delay_ns_ - delay_ns;
// Note that taking the request timeout into consideration allows us to reset
// the pending batch as soon as it is invalidated. The cost is that, in the
// edge case where timeouts expire one by one, the thread will be woken
// frequently.
if (queue_.ClosestTimeout() != 0) {
if (now_ns <= queue_.ClosestTimeout()) {
wait_ns = std::min(queue_.ClosestTimeout() - now_ns, wait_ns);
} else {
// A request in the pending batch has timed out; wait for 1 us to force the
// thread to reset the pending batch right away.
wait_ns = 1000;
}
}
// Return a non-zero wait in microseconds to cause this thread to wait
// until the queue delay or the closest timeout has expired.
// Another thread may be awakened by an incoming request and handle the
// pending batch before this thread wakes, and that is okay. But if no other
// request comes in, then this thread will wake, revisit the pending batch,
// see that the delay has been exceeded, and send the batch.
return wait_ns / 1000;
}
void
DynamicBatchScheduler::DelegateResponse(
std::unique_ptr<InferenceRequest>& request)
{
std::lock_guard<std::mutex> lock(completion_queue_mtx_);
completion_queue_.emplace_back();
auto queue_slot = &completion_queue_.back();
// Pass raw ptr to lambda for tracking stats from cache and updating
// metric reporter on cache miss stats after insertion
InferenceRequest* raw_request_ptr = request.get();
request->SetResponseDelegator(
[this, queue_slot, raw_request_ptr](
std::unique_ptr<InferenceResponse>&& response, const uint32_t flags) {
if (response_cache_enabled_ && raw_request_ptr->CacheKeyIsSet()) {
// Cache insertion happens here because we need the backend to have
// computed the inference response first in the case of cache miss
auto cache = model_->Server()->GetResponseCache();
auto status = cache->Insert(*response, raw_request_ptr);
bool cache_miss =
(status.StatusCode() != Status::Code::ALREADY_EXISTS);
if (cache_miss) {
#ifdef TRITON_ENABLE_STATS
// Update cache miss statistics even on failure to insert
// as we still spend time on lookup and attempting to insert
raw_request_ptr->ReportStatisticsCacheMiss(reporter_.get());
#endif // TRITON_ENABLE_STATS
if (!status.IsOk()) {
LOG_ERROR << raw_request_ptr->LogRequest()
<< "Failed to insert request_hash ["
<< raw_request_ptr->CacheKey()
<< "] into response cache: " << status.Message();
}
} // Otherwise do nothing; we update cache hit statistics on Lookup
}
if (preserve_ordering_) {
{
std::lock_guard<std::mutex> lock(completion_queue_mtx_);
queue_slot->emplace_back(std::move(response), flags);
}
FinalizeResponses();
} else {
InferenceResponse::Send(std::move(response), flags);
}
});
}
void
DynamicBatchScheduler::CacheLookUp(
std::unique_ptr<InferenceRequest>& request,
std::unique_ptr<InferenceResponse>& cached_response)
{
auto cache = model_->Server()->GetResponseCache();
// Lookup request in cache
std::unique_ptr<InferenceResponse> local_response;
request->ResponseFactory()->CreateResponse(&local_response);
auto status = cache->Lookup(local_response.get(), request.get());
if (status.IsOk() && (local_response != nullptr)) {
cached_response = std::move(local_response);
#ifdef TRITON_ENABLE_STATS
// Update model metrics/stats on cache hits
// Backends will update metrics as normal on cache misses
request->ReportStatisticsCacheHit(reporter_.get());
#endif // TRITON_ENABLE_STATS
}
}
void
DynamicBatchScheduler::FinalizeResponses()
{
// Need exclusive access to this function to ensure responses are sent
// in order.
std::lock_guard<std::mutex> lock(finalize_mtx_);
// Finalize the completed payloads in-order as far as possible
std::deque<std::pair<std::unique_ptr<InferenceResponse>, const uint32_t>>
responses;
{
std::lock_guard<std::mutex> queue_lock(completion_queue_mtx_);
while (!completion_queue_.empty() && !completion_queue_.front().empty()) {
bool response_complete = false;
for (auto& response_pair : completion_queue_.front()) {
// Assuming FINAL flag is set only in the last response of the request
response_complete =
((response_pair.second & TRITONSERVER_RESPONSE_COMPLETE_FINAL) !=
0);
responses.emplace_back(std::move(response_pair));
}
if (response_complete) {
completion_queue_.pop_front();
} else {
completion_queue_.front().clear();
}
}
}
for (auto& response : responses) {
InferenceResponse::Send(std::move(response.first), response.second);
}
}
}} // namespace triton::core
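// Illustrative only, not part of the original sources: a standalone, simplified
// sketch of the decision rule implemented by GetDynamicBatch() above. It
// ignores shape checks, payload state, and per-request timeouts, and keeps only
// the core idea: execute the pending batch immediately when a preferred batch
// size is reached or the maximum queue delay has elapsed, otherwise report how
// much longer to wait. All names are hypothetical.
#include <cstddef>
#include <cstdint>
#include <set>

namespace example {

// Returns 0 if the pending batch should be executed now, otherwise the
// remaining wait in nanoseconds before the queue delay forces execution.
inline uint64_t
SimplifiedBatchDecision(
    size_t pending_batch_size, const std::set<int32_t>& preferred_batch_sizes,
    uint64_t oldest_request_age_ns, uint64_t max_queue_delay_ns)
{
  const bool preferred_reached =
      preferred_batch_sizes.find(static_cast<int32_t>(pending_batch_size)) !=
      preferred_batch_sizes.end();
  const bool delay_exceeded = (max_queue_delay_ns != 0) &&
                              (oldest_request_age_ns >= max_queue_delay_ns);
  if (preferred_reached || delay_exceeded || (max_queue_delay_ns == 0)) {
    return 0;  // execute the pending batch now
  }
  return max_queue_delay_ns - oldest_request_age_ns;  // keep batching
}

}  // namespace example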
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <future>
#include <map>
#include <mutex>
#include <queue>
#include <set>
#include <thread>
#include "backend_model.h"
#include "backend_model_instance.h"
#include "model_config.pb.h"
#include "rate_limiter.h"
#include "scheduler.h"
#include "scheduler_utils.h"
#include "status.h"
#include "triton/common/model_config.h"
namespace triton { namespace core {
// Scheduler that implements dynamic batching.
class DynamicBatchScheduler : public Scheduler {
public:
// Create a scheduler to support a given number of runners and a run
// function to call when a request is scheduled.
static Status Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
std::unique_ptr<Scheduler>* scheduler);
// Create a scheduler to support a given number of runners and a run
// function to call when a request is scheduled. This overload additionally
// supports different queue policies for different priority levels.
static Status Create(
TritonModel* model, TritonModelInstance* model_instance, const int nice,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const inference::ModelDynamicBatching& batcher_config,
const bool response_cache_enable, std::unique_ptr<Scheduler>* scheduler);
~DynamicBatchScheduler();
// \see Scheduler::Enqueue()
Status Enqueue(std::unique_ptr<InferenceRequest>& request) override;
// \see Scheduler::InflightInferenceCount()
size_t InflightInferenceCount() override
{
std::unique_lock<std::mutex> lock(mu_);
if (curr_payload_ != nullptr) {
return queue_.Size() + curr_payload_->RequestCount();
}
return queue_.Size();
}
// \see Scheduler::Stop()
void Stop() override { stop_ = true; }
MetricModelReporter* MetricReporter() const { return reporter_.get(); }
private:
DynamicBatchScheduler(
TritonModel* model, TritonModelInstance* model_instance,
const bool dynamic_batching_enabled, const int32_t max_batch_size,
const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
const bool preserve_ordering, const bool response_cache_enable,
const std::set<int32_t>& preferred_batch_sizes,
const uint64_t max_queue_delay_microseconds,
const inference::ModelQueuePolicy& default_queue_policy,
const uint32_t priority_levels,
const ModelQueuePolicyMap& queue_policy_map);
void BatcherThread(const int nice);
void NewPayload();
uint64_t GetDynamicBatch();
void DelegateResponse(std::unique_ptr<InferenceRequest>& request);
void CacheLookUp(
std::unique_ptr<InferenceRequest>& request,
std::unique_ptr<InferenceResponse>& cached_response);
void FinalizeResponses();
TritonModel* model_;
TritonModelInstance* model_instance_;
// Name of the model.
std::string model_name_;
// True if dynamic batching is enabled.
const bool dynamic_batching_enabled_;
// Map from priority level to the queue holding inference requests for the
// model represented by this scheduler. If priority queues are not supported
// by the scheduler, the priority-zero entry is used as the single queue.
PriorityQueue queue_;
bool stop_;
std::thread scheduler_thread_;
std::atomic<bool> scheduler_thread_exit_;
// Mutex and condvar for signaling scheduler thread
std::mutex mu_;
std::condition_variable cv_;
std::shared_ptr<RateLimiter> rate_limiter_;
std::shared_ptr<Payload> curr_payload_;
bool payload_saturated_;
size_t max_batch_size_;
size_t max_preferred_batch_size_;
std::set<int32_t> preferred_batch_sizes_;
uint64_t pending_batch_delay_ns_;
size_t pending_batch_size_;
size_t queued_batch_size_;
size_t next_preferred_batch_size_;
// The input tensors that require shape checking before being
// allowed in a batch, as a map from tensor name to a bool. If a
// tensor is in the map then its shape must match the shape of the
// same tensor in the requests already in the batch. If the value is
// "true" then the tensor is additionally treated as a shape tensor
// and the values it contains must match those of the same tensor
// already in the batch. (An illustrative example follows the member
// declaration below.)
const std::unordered_map<std::string, bool> enforce_equal_shape_tensors_;
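// Illustrative example (tensor names and values are hypothetical, not taken
// from any model config): an entry {"INPUT0", false} requires INPUT0 in a new
// request to have the same dimensions as INPUT0 in the requests already
// batched, while {"INPUT_SHAPE", true} additionally treats INPUT_SHAPE as a
// shape tensor whose element values must also match those already in the
// batch.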
// Store information on whether the model contains optional inputs.
bool has_optional_input_;
// If true the ordering of responses matches the order of requests
// even when there are multiple scheduler threads.
const bool preserve_ordering_;
// If true, the scheduler will try to retrieve responses from cache.
bool response_cache_enabled_;
// Per completion-id queues to store the ready responses
std::deque<
std::vector<std::pair<std::unique_ptr<InferenceResponse>, uint32_t>>>
completion_queue_;
// Lock to protect completion_queue_
std::mutex completion_queue_mtx_;
// Preserves the order in which responses are finalized
std::mutex finalize_mtx_;
// Reporter for metrics, or nullptr if no metrics should be reported
std::shared_ptr<MetricModelReporter> reporter_;
};
}} // namespace triton::core
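// ---------------------------------------------------------------------------
// Editorial usage sketch (not part of the original sources). It shows how a
// caller might create a DynamicBatchScheduler through the first Create()
// overload declared above and then enqueue a request. The 'model',
// 'model_instance', and 'request' arguments are assumed to already exist in
// the embedding code, and the batching parameter values are illustrative only.
namespace triton { namespace core {
inline Status
ExampleCreateAndEnqueue(
    TritonModel* model, TritonModelInstance* model_instance,
    std::unique_ptr<InferenceRequest>& request)
{
  std::unique_ptr<Scheduler> scheduler;
  // Build a dynamic batcher that preserves response ordering, prefers batch
  // sizes of 4 or 8, and waits at most 100us to form a preferred batch.
  Status status = DynamicBatchScheduler::Create(
      model, model_instance, /* nice */ 0,
      /* dynamic_batching_enabled */ true, /* max_batch_size */ 8,
      /* enforce_equal_shape_tensors */ {}, /* preserve_ordering */ true,
      /* response_cache_enable */ false,
      /* preferred_batch_sizes */ {4, 8},
      /* max_queue_delay_microseconds */ 100, &scheduler);
  if (!status.IsOk()) {
    return status;
  }
  // Hand the request over to the scheduler; responses are delivered
  // asynchronously through the request's response factory.
  return scheduler->Enqueue(request);
}
}}  // namespace triton::core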