Commit 0a21fff9 authored by xiabo

Adapt to 0.1.0

parent 9484fd1c
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
class BackendModel;
//
// BackendModelInstance
//
// Common functionality for a backend model instance. This class is
// provided as a convenience; backends are not required to use this
// class.
//
class BackendModelInstance {
public:
BackendModelInstance(
BackendModel* backend_model,
TRITONBACKEND_ModelInstance* triton_model_instance);
virtual ~BackendModelInstance();
// Get the name, kind and device ID of the instance.
const std::string& Name() const { return name_; }
TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
int32_t DeviceId() const { return device_id_; }
// Get the handle to the TRITONBACKEND model instance.
TRITONBACKEND_ModelInstance* TritonModelInstance()
{
return triton_model_instance_;
}
// Get the BackendModel representing the model that corresponds to
// this instance.
BackendModel* Model() const { return backend_model_; }
// The model configuration 'default_model_filename' value, or the
// value in model configuration 'cc_model_filenames' for the GPU
// targeted by this instance. If neither is specified in the model
// configuration, an empty string is returned.
const std::string& ArtifactFilename() const { return artifact_filename_; }
// Returns the stream associated with this instance that can be used
// for GPU<->CPU memory transfers. Returns nullptr if GPU support is
// disabled or if this instance is not executing on a GPU.
cudaStream_t CudaStream() { return stream_; }
const std::string& HostPolicyName() const { return host_policy_name_; }
protected:
BackendModel* backend_model_;
TRITONBACKEND_ModelInstance* triton_model_instance_;
std::string name_;
TRITONSERVER_InstanceGroupKind kind_;
int32_t device_id_;
std::string artifact_filename_;
cudaStream_t stream_;
std::string host_policy_name_;
};
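//
// Example (illustrative sketch, not part of this header): backends typically
// derive their per-instance state from this class. 'ModelState' below is a
// hypothetical type derived from BackendModel.
//
//   class ModelInstanceState : public BackendModelInstance {
//    public:
//     ModelInstanceState(
//         ModelState* model_state,
//         TRITONBACKEND_ModelInstance* triton_model_instance)
//         : BackendModelInstance(model_state, triton_model_instance)
//     {
//     }
//   };
//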
//
// BackendModelInstanceException
//
// Exception thrown if error occurs while constructing an
// BackendModelInstance.
//
struct BackendModelInstanceException {
BackendModelInstanceException(TRITONSERVER_Error* err) : err_(err) {}
TRITONSERVER_Error* err_;
};
#define THROW_IF_BACKEND_INSTANCE_ERROR(X) \
do { \
TRITONSERVER_Error* tie_err__ = (X); \
if (tie_err__ != nullptr) { \
throw triton::backend::BackendModelInstanceException(tie_err__); \
} \
} while (false)
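//
// Example usage of the macro above (illustrative sketch): construction errors
// thrown as BackendModelInstanceException are converted back to a
// TRITONSERVER_Error at the TRITONBACKEND API boundary. 'SomeCallThatMayFail'
// is a hypothetical function returning TRITONSERVER_Error*.
//
//   try {
//     THROW_IF_BACKEND_INSTANCE_ERROR(SomeCallThatMayFail());
//   }
//   catch (const BackendModelInstanceException& ex) {
//     return ex.err_;  // hand the error back to Triton
//   }
//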
}} // namespace triton::backend
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <list>
#include <string>
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/common/async_work_queue.h"
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
using cudaEvent_t = void*;
#endif // !TRITON_ENABLE_GPU
//
// BackendOutputResponder
//
class BackendOutputResponder {
public:
// The caller can optionally provide 'event' for internal synchronization
// instead of using 'stream'.
explicit BackendOutputResponder(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
TRITONBACKEND_MemoryManager* memory_manager,
const bool first_dim_batching, const bool pinned_enabled,
cudaStream_t stream, cudaEvent_t event = nullptr,
bool copy_on_stream = false)
: need_sync_(false), requests_(requests), request_count_(request_count),
responses_(responses), memory_manager_(memory_manager),
first_dim_batching_(first_dim_batching),
pinned_enabled_(pinned_enabled),
use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
stream_(stream), event_(event), pending_pinned_byte_size_(0),
copy_on_stream_(copy_on_stream)
{
}
// Legacy constructor for backwards compatibility. The above
// constructor should be used for all new cases. The responder needs
// to know if the model is batching along the first dimension. With
// this constructor we derive that information from the
// max_batch_size value instead of having it provided directly as in
// the above constructor.
explicit BackendOutputResponder(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses, const int max_batch_size,
TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,
cudaStream_t stream, cudaEvent_t event = nullptr,
bool copy_on_stream = false)
: need_sync_(false), requests_(requests), request_count_(request_count),
responses_(responses), memory_manager_(memory_manager),
first_dim_batching_(max_batch_size >= 1),
pinned_enabled_(pinned_enabled),
use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
stream_(stream), event_(event), pending_pinned_byte_size_(0),
copy_on_stream_(copy_on_stream)
{
}
~BackendOutputResponder();
// Process all responses for a named output tensor.
// 'batchn_shape' may be modified by the call.
void ProcessTensor(
const std::string& name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
// Process all responses for a named state tensor. Returns a vector of
// TRITONBACKEND_State objects that the backend can use to update the state.
// If TRITONBACKEND_StateUpdate is not called on the vector elements, the
// state will not be updated.
// 'batchn_shape' may be modified by the call.
std::vector<TRITONBACKEND_State*> ProcessStateTensor(
const std::string& name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
// Process all responses for a batch output and derive its value from
// 'buffer'.
void ProcessBatchOutput(
const std::string& name, const BatchOutput& batch_output,
const char* buffer, const TRITONSERVER_MemoryType memory_type,
const int64_t memory_type_id);
// Finalize processing of all responses for all output
// tensors. Return true if cudaMemcpyAsync is called, and the caller
// should call cudaStreamSynchronize (or cudaEventSynchronize on 'event')
// before using the data.
bool Finalize();
private:
bool FlushPendingPinned(
const char* tensor_buffer,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id);
bool SetFixedSizeBuffer(
TRITONBACKEND_Response** response, void* response_state_or_output,
const std::string& output_name, const size_t tensor_byte_size,
const size_t tensor_offset, const char* tensor_buffer,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id,
const TRITONSERVER_MemoryType use_pinned_memory_type, bool state);
struct OutputData {
OutputData(
const std::string& name, void* buffer, const size_t buffer_byte_size,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
: name_(name), buffer_(buffer), buffer_byte_size_(buffer_byte_size),
memory_type_(memory_type), memory_type_id_(memory_type_id)
{
}
const std::string name_;
void* buffer_;
const size_t buffer_byte_size_;
const TRITONSERVER_MemoryType memory_type_;
const int64_t memory_type_id_;
};
bool need_sync_;
TRITONBACKEND_Request** requests_;
const uint32_t request_count_;
std::vector<TRITONBACKEND_Response*>* responses_;
TRITONBACKEND_MemoryManager* memory_manager_;
const bool first_dim_batching_;
const bool pinned_enabled_;
const bool use_async_cpu_copy_;
cudaStream_t stream_;
cudaEvent_t event_;
using ResponsesList =
std::list<std::pair<TRITONBACKEND_Response**, OutputData>>;
size_t pending_pinned_byte_size_;
size_t pending_pinned_offset_;
ResponsesList pending_pinned_outputs_;
const bool copy_on_stream_;
// Pinned memories that need to live over the lifetime of this
// BackendOutputResponder object.
std::list<char*> pinned_memories_;
// Pinned memory buffers and the corresponding response outputs
// where the final copy to the response is deferred until Finalize()
// after waiting for all in-flight copies.
struct DeferredPinned {
DeferredPinned(
char* pinned_memory, const size_t pinned_memory_size,
ResponsesList&& responses)
: pinned_memory_(pinned_memory),
pinned_memory_size_(pinned_memory_size),
responses_(std::move(responses))
{
}
char* pinned_memory_;
const size_t pinned_memory_size_;
ResponsesList responses_;
};
std::list<DeferredPinned> deferred_pinned_;
};
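//
// Example usage (illustrative sketch; 'requests', 'responses', 'stream',
// 'batchn_shape', 'output_buffer' and the output name are hypothetical): a
// backend typically creates one responder per execution, calls ProcessTensor()
// for each output, and synchronizes only if Finalize() reports an in-flight
// CUDA copy.
//
//   BackendOutputResponder responder(
//       requests, request_count, &responses, memory_manager,
//       true /* first_dim_batching */, true /* pinned_enabled */, stream);
//   responder.ProcessTensor(
//       "OUTPUT0", TRITONSERVER_TYPE_FP32, batchn_shape, output_buffer,
//       TRITONSERVER_MEMORY_GPU, 0 /* memory_type_id */);
//   if (responder.Finalize()) {
//     cudaStreamSynchronize(stream);
//   }
//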
}} // namespace triton::backend
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_common.h"
#ifdef _WIN32
// suppress the min and max definitions in Windef.h.
#define NOMINMAX
#include <Windows.h>
// Define _CRT_INTERNAL_NONSTDC_NAMES 1 before including the Microsoft-provided
// C Runtime library headers to expose declarations without the "_" prefix, to
// match POSIX style.
#define _CRT_INTERNAL_NONSTDC_NAMES 1
#include <direct.h>
#include <io.h>
#else
#include <dirent.h>
#include <unistd.h>
#endif
#include <sys/stat.h>
#include <algorithm>
#include <cerrno>
#include <fstream>
#include <functional>
#include <memory>
#ifdef _WIN32
// <sys/stat.h> in Windows doesn't define S_ISDIR macro
#if !defined(S_ISDIR) && defined(S_IFMT) && defined(S_IFDIR)
#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
#endif
#define F_OK 0
#endif
namespace triton { namespace backend {
#ifdef TRITON_ENABLE_GPU
void CUDART_CB
MemcpyHost(void* args)
{
auto* copy_params = reinterpret_cast<CopyParams*>(args);
memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);
delete copy_params;
}
#endif // TRITON_ENABLE_GPU
TRITONSERVER_MemoryType
GetUsePinnedMemoryType(TRITONSERVER_MemoryType ref_buffer_type)
{
// The following matrix is used for both input and output.
// src \ dest | non-pinned | pinned | device
// non-pinned | memcpy | memcpy | buffer needed
// pinned | memcpy | memcpy | cudaMemcpy
// device | buffer needed | cudaMemcpy | cudaMemcpy
if (ref_buffer_type == TRITONSERVER_MEMORY_CPU_PINNED) {
return TRITONSERVER_MEMORY_CPU_PINNED;
}
return (ref_buffer_type == TRITONSERVER_MEMORY_CPU) ? TRITONSERVER_MEMORY_GPU
: TRITONSERVER_MEMORY_CPU;
}
TRITONSERVER_Error_Code
StatusCodeToTritonCode(triton::common::Error::Code error_code)
{
switch (error_code) {
case triton::common::Error::Code::UNKNOWN:
return TRITONSERVER_ERROR_UNKNOWN;
case triton::common::Error::Code::INTERNAL:
return TRITONSERVER_ERROR_INTERNAL;
case triton::common::Error::Code::NOT_FOUND:
return TRITONSERVER_ERROR_NOT_FOUND;
case triton::common::Error::Code::INVALID_ARG:
return TRITONSERVER_ERROR_INVALID_ARG;
case triton::common::Error::Code::UNAVAILABLE:
return TRITONSERVER_ERROR_UNAVAILABLE;
case triton::common::Error::Code::UNSUPPORTED:
return TRITONSERVER_ERROR_UNSUPPORTED;
case triton::common::Error::Code::ALREADY_EXISTS:
return TRITONSERVER_ERROR_ALREADY_EXISTS;
default:
break;
}
return TRITONSERVER_ERROR_UNKNOWN;
}
TRITONSERVER_Error*
CommonErrorToTritonError(triton::common::Error error)
{
return TRITONSERVER_ErrorNew(
StatusCodeToTritonCode(error.ErrorCode()), error.Message().c_str());
}
TRITONSERVER_Error*
ParseShape(
common::TritonJson::Value& io, const std::string& name,
std::vector<int64_t>* shape)
{
common::TritonJson::Value shape_array;
RETURN_IF_ERROR(io.MemberAsArray(name.c_str(), &shape_array));
for (size_t i = 0; i < shape_array.ArraySize(); ++i) {
int64_t d = 0;
RETURN_IF_ERROR(shape_array.IndexAsInt(i, &d));
shape->push_back(d);
}
return nullptr; // success
}
std::string
ShapeToString(const int64_t* dims, const size_t dims_count)
{
bool first = true;
std::string str("[");
for (size_t i = 0; i < dims_count; ++i) {
const int64_t dim = dims[i];
if (!first) {
str += ",";
}
str += std::to_string(dim);
first = false;
}
str += "]";
return str;
}
std::string
ShapeToString(const std::vector<int64_t>& shape)
{
return ShapeToString(shape.data(), shape.size());
}
int64_t
GetElementCount(const int64_t* dims, const size_t dims_count)
{
bool first = true;
int64_t cnt = 0;
for (size_t i = 0; i < dims_count; i++) {
if (dims[i] == WILDCARD_DIM) {
return -1;
}
if (first) {
cnt = dims[i];
first = false;
} else {
cnt *= dims[i];
}
}
return cnt;
}
int64_t
GetElementCount(const std::vector<int64_t>& shape)
{
return GetElementCount(shape.data(), shape.size());
}
int64_t
GetByteSize(
const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims)
{
size_t dt_size = TRITONSERVER_DataTypeByteSize(dtype);
if (dt_size == 0) {
return -1;
}
int64_t cnt = GetElementCount(dims);
if (cnt == -1) {
return -1;
}
return cnt * dt_size;
}
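//
// Example (worked case, illustrative): for TRITONSERVER_TYPE_FP32 and dims
// {2, 3} the element count is 6, so GetByteSize() returns 6 * 4 = 24. If any
// dimension is WILDCARD_DIM the size is unknown and -1 is returned.
//
//   const std::vector<int64_t> dims{2, 3};
//   const int64_t byte_size = GetByteSize(TRITONSERVER_TYPE_FP32, dims);  // 24
//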
TRITONSERVER_Error*
ReadInputTensor(
TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,
size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used,
const char* host_policy_name, const bool copy_on_stream)
{
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(
TRITONBACKEND_RequestInput(request, input_name.c_str(), &input));
uint64_t input_byte_size;
uint32_t input_buffer_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_name, nullptr, nullptr, nullptr, nullptr,
&input_byte_size, &input_buffer_count));
RETURN_ERROR_IF_FALSE(
input_byte_size <= *buffer_byte_size, TRITONSERVER_ERROR_INVALID_ARG,
std::string(
GetRequestId(request) + "buffer too small for input tensor '" +
input_name + "', " + std::to_string(*buffer_byte_size) + " < " +
std::to_string(input_byte_size)));
size_t output_buffer_offset = 0;
for (uint32_t b = 0; b < input_buffer_count; ++b) {
const void* input_buffer = nullptr;
uint64_t input_buffer_byte_size = 0;
TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;
int64_t input_memory_type_id = 0;
RETURN_IF_ERROR(TRITONBACKEND_InputBufferForHostPolicy(
input, host_policy_name, b, &input_buffer, &input_buffer_byte_size,
&input_memory_type, &input_memory_type_id));
RETURN_IF_ERROR(CopyBuffer(
"Failed to copy buffer", input_memory_type, input_memory_type_id,
memory_type, memory_type_id, input_buffer_byte_size, input_buffer,
buffer + output_buffer_offset, cuda_stream, cuda_used, copy_on_stream));
output_buffer_offset += input_buffer_byte_size;
}
*buffer_byte_size = input_byte_size;
return nullptr; // success
}
TRITONSERVER_Error*
ReadInputTensor(
TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,
size_t* buffer_byte_size, const char* host_policy_name)
{
bool cuda_used;
return ReadInputTensor(
request, input_name, buffer, buffer_byte_size,
TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */,
nullptr /* cuda_stream */, &cuda_used, host_policy_name);
}
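//
// Example usage (illustrative sketch; sizing the buffer is the caller's
// responsibility and 'expected_byte_size' is hypothetical): read a whole input
// into a caller-owned CPU buffer.
//
//   std::vector<char> input(expected_byte_size);
//   size_t actual_byte_size = input.size();
//   RETURN_IF_ERROR(ReadInputTensor(
//       request, "INPUT0", input.data(), &actual_byte_size,
//       nullptr /* host_policy_name */));
//   // On return, 'actual_byte_size' holds the number of bytes actually read.
//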
TRITONSERVER_Error*
CheckAllowedModelInput(
common::TritonJson::Value& io, const std::set<std::string>& allowed)
{
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
if (allowed.find(io_name) == allowed.end()) {
std::string astr;
for (const auto& a : allowed) {
if (!astr.empty()) {
astr.append(", ");
}
astr.append(a);
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
"unexpected inference input '" + io_name +
"', allowed inputs are: " + astr)
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
CheckAllowedModelOutput(
common::TritonJson::Value& io, const std::set<std::string>& allowed)
{
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
if (allowed.find(io_name) == allowed.end()) {
std::string astr;
for (const auto& a : allowed) {
if (!astr.empty()) {
astr.append(", ");
}
astr.append(a);
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
"unexpected inference output '" + io_name +
"', allowed outputs are: " + astr)
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
GetBooleanSequenceControlProperties(
common::TritonJson::Value& batcher, const std::string& model_name,
const std::string& control_kind, const bool required,
std::string* tensor_name, std::string* tensor_datatype,
float* fp32_false_value, float* fp32_true_value, int32_t* int32_false_value,
int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value)
{
// Make sure same tensor is not configured for multiple controls
std::set<std::string> seen_tensors;
// Make sure the control kind is not mentioned multiple times.
bool seen_control = false;
common::TritonJson::Value control_inputs;
if (batcher.Find("control_input", &control_inputs)) {
for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {
common::TritonJson::Value control_input;
RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));
std::string input_name;
RETURN_IF_ERROR(control_input.MemberAsString("name", &input_name));
if (input_name.empty()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must have a name for ") +
model_name)
.c_str());
}
if (seen_tensors.find(input_name) != seen_tensors.end()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("sequence batching control tensor '") + input_name +
"' is specified for multiple control kinds for " + model_name)
.c_str());
}
seen_tensors.insert(input_name);
common::TritonJson::Value controls;
if (control_input.Find("control", &controls)) {
for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {
common::TritonJson::Value c;
RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));
std::string kind_str;
RETURN_IF_ERROR(c.MemberAsString("kind", &kind_str));
if (kind_str == control_kind) {
if (seen_control) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching specifies multiple " + control_kind +
" tensors for " + model_name)
.c_str()));
}
*tensor_name = input_name;
seen_control = true;
common::TritonJson::Value int32_false_true, fp32_false_true,
bool_false_true;
bool found_int32 =
(c.Find("int32_false_true", &int32_false_true) &&
(int32_false_true.ArraySize() > 0));
bool found_fp32 =
(c.Find("fp32_false_true", &fp32_false_true) &&
(fp32_false_true.ArraySize() > 0));
bool found_bool =
(c.Find("bool_false_true", &bool_false_true) &&
(bool_false_true.ArraySize() > 0));
// Make sure only one of int, float, or bool type is specified.
if (!(found_int32 || found_fp32 || found_bool)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching must specify either "
"'int32_false_true', 'fp32_false_true' or "
"'bool_false_true' for " +
control_kind + " for " + model_name))
.c_str());
} else if (
(found_fp32 && found_int32) || (found_fp32 && found_bool) ||
(found_int32 && found_bool)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching specifies more than one from "
"'int32_false_true', 'fp32_false_true' and "
"'bool_false_true' for " +
control_kind + " for " + model_name))
.c_str());
}
if (found_int32) {
if (int32_false_true.ArraySize() != 2) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control 'int32_false_true' must "
"have "
"exactly 2 entries for " +
control_kind + " for " + model_name))
.c_str());
}
if (tensor_datatype != nullptr) {
*tensor_datatype = "TYPE_INT32";
}
if (int32_false_value != nullptr) {
int64_t value;
RETURN_IF_ERROR(int32_false_true.IndexAsInt(0, &value));
*int32_false_value = value;
}
if (int32_true_value != nullptr) {
int64_t value;
RETURN_IF_ERROR(int32_false_true.IndexAsInt(1, &value));
*int32_true_value = value;
}
} else if (found_fp32) {
if (fp32_false_true.ArraySize() != 2) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control 'fp32_false_true' must "
"have exactly "
"2 entries for " +
control_kind + " for " + model_name))
.c_str());
}
if (tensor_datatype != nullptr) {
*tensor_datatype = "TYPE_FP32";
}
if (fp32_false_value != nullptr) {
double value = 0.0;
RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(0, &value));
*fp32_false_value = value;
}
if (fp32_true_value != nullptr) {
double value = 0.0;
RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(1, &value));
*fp32_true_value = value;
}
} else {
if (bool_false_true.ArraySize() != 2) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control 'bool_false_true' must "
"have exactly "
"2 entries for " +
control_kind + " for " + model_name))
.c_str());
}
if (tensor_datatype != nullptr) {
*tensor_datatype = "TYPE_BOOL";
}
if (bool_false_value != nullptr) {
bool value;
RETURN_IF_ERROR(bool_false_true.IndexAsBool(0, &value));
*bool_false_value = value;
}
if (bool_true_value != nullptr) {
bool value;
RETURN_IF_ERROR(bool_false_true.IndexAsBool(1, &value));
*bool_true_value = value;
}
}
}
}
}
}
}
if (!seen_control) {
if (required) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must specify a " +
control_kind + " value for " + model_name))
.c_str());
}
tensor_name->clear();
}
return nullptr; // success
}
TRITONSERVER_Error*
GetTypedSequenceControlProperties(
common::TritonJson::Value& batcher, const std::string& model_name,
const std::string& control_kind, const bool required,
std::string* tensor_name, std::string* tensor_datatype)
{
// Make sure same tensor is not configured for multiple controls
std::set<std::string> seen_tensors;
// Make sure the control kind is not mentioned multiple times.
bool seen_control = false;
common::TritonJson::Value control_inputs;
if (batcher.Find("control_input", &control_inputs)) {
for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {
common::TritonJson::Value control_input;
RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));
std::string input_name;
RETURN_IF_ERROR(control_input.MemberAsString("name", &input_name));
if (input_name.empty()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must have a name for ") +
model_name)
.c_str());
}
if (seen_tensors.find(input_name) != seen_tensors.end()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("sequence batching control tensor '") + input_name +
"' is specified for multiple control kinds for " + model_name)
.c_str());
}
seen_tensors.insert(input_name);
common::TritonJson::Value controls;
if (control_input.Find("control", &controls)) {
for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {
common::TritonJson::Value c;
RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));
std::string kind_str;
RETURN_IF_ERROR(c.MemberAsString("kind", &kind_str));
if (kind_str == control_kind) {
if (seen_control) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching specifies multiple " + control_kind +
" tensors for " + model_name)
.c_str()));
}
*tensor_name = input_name;
if (tensor_datatype != nullptr) {
RETURN_IF_ERROR(c.MemberAsString("data_type", tensor_datatype));
}
seen_control = true;
common::TritonJson::Value int32_false_true, fp32_false_true,
bool_false_true;
bool found_int32 =
(c.Find("int32_false_true", &int32_false_true) &&
(int32_false_true.ArraySize() > 0));
bool found_fp32 =
(c.Find("fp32_false_true", &fp32_false_true) &&
(fp32_false_true.ArraySize() > 0));
bool found_bool =
(c.Find("bool_false_true", &bool_false_true) &&
(bool_false_true.ArraySize() > 0));
if (found_fp32 || found_int32 || found_bool) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching must not specify either "
"'int32_false_true', 'fp32_false_true' or "
"'bool_false_true' for " +
control_kind + " for " + model_name))
.c_str());
}
}
}
}
}
}
if (!seen_control) {
if (required) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string(
"sequence batching control tensor must specify a " +
control_kind + " value for " + model_name))
.c_str());
}
tensor_name->clear();
}
return nullptr; // success
}
void
RequestsRespondWithError(
TRITONBACKEND_Request** requests, const uint32_t request_count,
TRITONSERVER_Error* response_err, const bool release_request)
{
for (size_t i = 0; i < request_count; i++) {
TRITONBACKEND_Response* response;
auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
(GetRequestId(requests[i]) + "fail to create response").c_str());
TRITONSERVER_ErrorDelete(err);
} else {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),
(GetRequestId(requests[i]) + "fail to send error response").c_str());
}
if (release_request) {
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(
requests[i], TRITONSERVER_REQUEST_RELEASE_ALL),
"fail to release request");
requests[i] = nullptr;
}
}
TRITONSERVER_ErrorDelete(response_err);
}
void
SendErrorForResponses(
std::vector<TRITONBACKEND_Response*>* responses,
const uint32_t response_count, TRITONSERVER_Error* response_err)
{
for (size_t i = 0; i < response_count; i++) {
TRITONBACKEND_Response* response = (*responses)[i];
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),
"fail to send error response");
(*responses)[i] = nullptr;
}
}
TRITONSERVER_ErrorDelete(response_err);
}
TRITONSERVER_Error*
CopyBuffer(
const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
const int64_t src_memory_type_id,
const TRITONSERVER_MemoryType dst_memory_type,
const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
void* dst, cudaStream_t cuda_stream, bool* cuda_used,
const bool copy_on_stream)
{
*cuda_used = false;
if (byte_size > 0) {
if (src == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
msg + ": attempted a copy of " + std::to_string(byte_size) +
" Bytes from an uninitialized memory")
.c_str());
}
if (dst == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
msg + ": attempted a copy of " + std::to_string(byte_size) +
" Bytes to an uninitialized memory")
.c_str());
}
}
// For host-to-host copies: when 'copy_on_stream' is false the copy blocks the
// host, so memcpy() is used directly. When 'copy_on_stream' is true the copy
// is enqueued on 'cuda_stream' via cudaLaunchHostFunc(), so the caller must
// ensure the src buffer stays valid until the stream executes the copy.
if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&
(dst_memory_type != TRITONSERVER_MEMORY_GPU)) {
#ifdef TRITON_ENABLE_GPU
if (copy_on_stream) {
auto params = new CopyParams(dst, src, byte_size);
cudaLaunchHostFunc(
cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));
*cuda_used = true;
} else {
memcpy(dst, src, byte_size);
}
#else
memcpy(dst, src, byte_size);
#endif // TRITON_ENABLE_GPU
} else {
#ifdef TRITON_ENABLE_GPU
// [TODO] use cudaMemcpyDefault if UVM is supported for the device
auto copy_kind = cudaMemcpyDeviceToDevice;
if (src_memory_type != TRITONSERVER_MEMORY_GPU) {
copy_kind = cudaMemcpyHostToDevice;
} else if (dst_memory_type != TRITONSERVER_MEMORY_GPU) {
copy_kind = cudaMemcpyDeviceToHost;
}
if ((src_memory_type_id != dst_memory_type_id) &&
(copy_kind == cudaMemcpyDeviceToDevice)) {
RETURN_IF_CUDA_ERROR(
cudaMemcpyPeerAsync(
dst, dst_memory_type_id, src, src_memory_type_id, byte_size,
cuda_stream),
TRITONSERVER_ERROR_INTERNAL, msg + ": failed to perform CUDA copy");
} else {
RETURN_IF_CUDA_ERROR(
cudaMemcpyAsync(dst, src, byte_size, copy_kind, cuda_stream),
TRITONSERVER_ERROR_INTERNAL, msg + ": failed to perform CUDA copy");
}
*cuda_used = true;
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(msg + ": try to use CUDA copy while GPU is not supported")
.c_str());
#endif // TRITON_ENABLE_GPU
}
return nullptr; // success
}
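//
// Example usage (illustrative sketch): copy a device buffer to host memory and
// synchronize only if an asynchronous CUDA copy was actually issued.
//
//   bool cuda_used = false;
//   RETURN_IF_ERROR(CopyBuffer(
//       "output copy", TRITONSERVER_MEMORY_GPU, 0 /* src id */,
//       TRITONSERVER_MEMORY_CPU, 0 /* dst id */, byte_size, src, dst, stream,
//       &cuda_used));
//   if (cuda_used) {
//     cudaStreamSynchronize(stream);
//   }
//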
TRITONSERVER_Error*
GetDirectoryContents(const std::string& path, std::set<std::string>* contents)
{
#ifdef _WIN32
WIN32_FIND_DATAA entry;
// Append "/*" so the search enumerates the directory contents rather than
// matching the directory itself.
const std::string wildcard_path = path + "/*";
HANDLE dir = FindFirstFileA(wildcard_path.c_str(), &entry);
if (dir == INVALID_HANDLE_VALUE) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to open directory: ") + path).c_str());
}
do {
const std::string entryname = entry.cFileName;
if ((entryname != ".") && (entryname != "..")) {
contents->insert(entryname);
}
} while (FindNextFileA(dir, &entry));
FindClose(dir);
#else
DIR* dir = opendir(path.c_str());
if (dir == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to open directory: ") + path).c_str());
}
struct dirent* entry;
while ((entry = readdir(dir)) != nullptr) {
std::string entryname = entry->d_name;
if ((entryname != ".") && (entryname != "..")) {
contents->insert(entryname);
}
}
closedir(dir);
#endif
return nullptr; // success
}
TRITONSERVER_Error*
FileExists(const std::string& path, bool* exists)
{
*exists = (access(path.c_str(), F_OK) == 0);
return nullptr; // success
}
TRITONSERVER_Error*
ReadTextFile(const std::string& path, std::string* contents)
{
std::ifstream in(path, std::ios::in | std::ios::binary);
if (!in) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
("failed to open/read file '" + path + "': " + strerror(errno))
.c_str());
}
in.seekg(0, std::ios::end);
contents->resize(in.tellg());
in.seekg(0, std::ios::beg);
in.read(&(*contents)[0], contents->size());
in.close();
return nullptr; // success
}
TRITONSERVER_Error*
IsDirectory(const std::string& path, bool* is_dir)
{
*is_dir = false;
struct stat st;
if (stat(path.c_str(), &st) != 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to stat file ") + path).c_str());
}
*is_dir = S_ISDIR(st.st_mode);
return nullptr; // success
}
std::string
JoinPath(std::initializer_list<std::string> segments)
{
std::string joined;
for (const auto& seg : segments) {
if (joined.empty()) {
joined = seg;
} else if (!seg.empty() && (seg[0] == '/')) { // IsAbsolutePath(seg)
if (joined[joined.size() - 1] == '/') {
joined.append(seg.substr(1));
} else {
joined.append(seg);
}
} else { // !IsAbsolutePath(seg)
if (joined[joined.size() - 1] != '/') {
joined.append("/");
}
joined.append(seg);
}
}
return joined;
}
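//
// Example (worked cases, illustrative):
//   JoinPath({"/models", "resnet", "1"})  -> "/models/resnet/1"
//   JoinPath({"/models/", "/resnet"})     -> "/models/resnet"
//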
TRITONSERVER_Error*
ModelPaths(
const std::string& model_repository_path, uint64_t version,
const bool ignore_directories, const bool ignore_files,
std::unordered_map<std::string, std::string>* model_paths)
{
std::set<std::string> model_files;
// Read all the files in 'path' and filter by type for different requirements
auto path = JoinPath({model_repository_path, std::to_string(version)});
RETURN_IF_ERROR(GetDirectoryContents(path, &model_files));
if (ignore_directories) {
// Erase directory entries...
for (auto iter = model_files.begin(); iter != model_files.end();) {
bool is_dir;
RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));
if (is_dir) {
iter = model_files.erase(iter);
} else {
++iter;
}
}
}
if (ignore_files) {
// Erase non-directory entries...
for (auto iter = model_files.begin(); iter != model_files.end();) {
bool is_dir;
RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));
if (!is_dir) {
iter = model_files.erase(iter);
} else {
++iter;
}
}
}
for (const auto& filename : model_files) {
const auto model_path = JoinPath({path, filename});
model_paths->emplace(
std::piecewise_construct, std::make_tuple(filename),
std::make_tuple(model_path));
}
return nullptr; // success
}
TRITONSERVER_Error*
CreateCudaStream(
const int device_id, const int cuda_stream_priority, cudaStream_t* stream)
{
*stream = nullptr;
#ifdef TRITON_ENABLE_GPU
// Make sure that correct device is set before creating stream and
// then restore the device to what was set by the caller.
int current_device;
auto cuerr = cudaGetDevice(&current_device);
bool overridden = false;
if (cuerr == cudaSuccess) {
overridden = (current_device != device_id);
if (overridden) {
cuerr = cudaSetDevice(device_id);
}
}
if (cuerr == cudaSuccess) {
cuerr = cudaStreamCreateWithPriority(
stream, cudaStreamDefault, cuda_stream_priority);
}
if (overridden) {
cudaSetDevice(current_device);
}
if (cuerr != cudaSuccess) {
*stream = nullptr;
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to create stream: ") + cudaGetErrorString(cuerr))
.c_str());
}
#endif // TRITON_ENABLE_GPU
return nullptr; // success
}
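//
// Example usage (illustrative sketch): create a per-instance stream at default
// priority and destroy it when the owning instance is destroyed.
//
//   cudaStream_t stream = nullptr;
//   RETURN_IF_ERROR(CreateCudaStream(device_id, 0 /* priority */, &stream));
//   // ... use 'stream' for CopyBuffer() and kernel launches ...
//   #ifdef TRITON_ENABLE_GPU
//   if (stream != nullptr) {
//     cudaStreamDestroy(stream);
//   }
//   #endif  // TRITON_ENABLE_GPU
//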
TRITONSERVER_Error*
ParseLongLongValue(const std::string& value, int64_t* parsed_value)
{
try {
*parsed_value = std::stoll(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value +
"' to long long integral number")
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
ParseUnsignedLongLongValue(const std::string& value, uint64_t* parsed_value)
{
try {
*parsed_value = std::stoull(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value +
"' to unsigned long long integral number")
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
ParseBoolValue(const std::string& value, bool* parsed_value)
{
std::string lvalue = value;
std::transform(
lvalue.begin(), lvalue.end(), lvalue.begin(),
[](unsigned char c) { return std::tolower(c); });
if ((lvalue == "true") || (lvalue == "on") || (lvalue == "1")) {
*parsed_value = true;
return nullptr; // success
}
if ((lvalue == "false") || (lvalue == "off") || (lvalue == "0")) {
*parsed_value = false;
return nullptr; // success
}
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value + "' to boolean").c_str());
}
TRITONSERVER_Error*
ParseIntValue(const std::string& value, int* parsed_value)
{
try {
*parsed_value = std::stoi(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value + "' to integral number")
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
ParseDoubleValue(const std::string& value, double* parsed_value)
{
try {
*parsed_value = std::stod(value);
}
catch (const std::invalid_argument& ia) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("failed to convert '") + value + "' to double number")
.c_str());
}
return nullptr; // success
}
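//
// Example usage (illustrative): the Parse* helpers above convert the string
// values commonly found in model configuration parameters.
//
//   bool flag = false;
//   RETURN_IF_ERROR(ParseBoolValue("ON", &flag));   // flag == true
//   int count = 0;
//   RETURN_IF_ERROR(ParseIntValue("42", &count));   // count == 42
//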
TRITONSERVER_Error*
GetParameterValue(
triton::common::TritonJson::Value& params, const std::string& key,
std::string* value)
{
triton::common::TritonJson::Value json_value;
RETURN_ERROR_IF_FALSE(
params.Find(key.c_str(), &json_value), TRITONSERVER_ERROR_NOT_FOUND,
std::string("model configuration is missing the parameter ") + key);
RETURN_IF_ERROR(json_value.MemberAsString("string_value", value));
return nullptr; // success
}
TRITONSERVER_Error*
BatchInput::ParseFromModelConfig(
triton::common::TritonJson::Value& config,
std::vector<BatchInput>* batch_inputs)
{
batch_inputs->clear();
triton::common::TritonJson::Value bis;
RETURN_IF_ERROR(config.MemberAsArray("batch_input", &bis));
for (size_t i = 0; i < bis.ArraySize(); ++i) {
triton::common::TritonJson::Value bi;
RETURN_IF_ERROR(bis.IndexAsObject(i, &bi));
batch_inputs->emplace_back();
RETURN_IF_ERROR(batch_inputs->back().Init(bi));
}
return nullptr; // success
}
TRITONSERVER_Error*
BatchInput::Init(triton::common::TritonJson::Value& bi_config)
{
{
triton::common::TritonJson::Value bi_target_names;
RETURN_IF_ERROR(bi_config.MemberAsArray("target_name", &bi_target_names));
for (size_t i = 0; i < bi_target_names.ArraySize(); ++i) {
std::string tn;
RETURN_IF_ERROR(bi_target_names.IndexAsString(i, &tn));
target_names_.emplace_back(std::move(tn));
}
}
{
RETURN_IF_ERROR(bi_config.MemberAsString("kind", &kind_str_));
if (kind_str_ == "BATCH_ELEMENT_COUNT") {
kind_ = Kind::BATCH_ELEMENT_COUNT;
} else if (kind_str_ == "BATCH_ACCUMULATED_ELEMENT_COUNT") {
kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT;
} else if (kind_str_ == "BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO") {
kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO;
} else if (kind_str_ == "BATCH_MAX_ELEMENT_COUNT_AS_SHAPE") {
kind_ = Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE;
} else if (kind_str_ == "BATCH_ITEM_SHAPE") {
kind_ = Kind::BATCH_ITEM_SHAPE;
} else if (kind_str_ == "BATCH_ITEM_SHAPE_FLATTEN") {
kind_ = Kind::BATCH_ITEM_SHAPE_FLATTEN;
} else {
RETURN_ERROR_IF_FALSE(
false, TRITONSERVER_ERROR_INVALID_ARG,
std::string("unexpected batch input kind '" + kind_str_ + "'"));
}
}
{
std::string bi_dtype;
RETURN_IF_ERROR(bi_config.MemberAsString("data_type", &bi_dtype));
data_type_ = ModelConfigDataTypeToTritonServerDataType(bi_dtype);
RETURN_ERROR_IF_TRUE(
data_type_ == TRITONSERVER_TYPE_INVALID, TRITONSERVER_ERROR_INVALID_ARG,
std::string("unexpected batch input data type '" + bi_dtype + "'"));
}
{
triton::common::TritonJson::Value bi_source_inputs;
RETURN_IF_ERROR(bi_config.MemberAsArray("source_input", &bi_source_inputs));
for (size_t i = 0; i < bi_source_inputs.ArraySize(); ++i) {
std::string si;
RETURN_IF_ERROR(bi_source_inputs.IndexAsString(i, &si));
source_inputs_.emplace_back(std::move(si));
}
}
return nullptr; // success
}
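//
// Example (illustrative): a model configuration fragment, written in the usual
// config.pbtxt form, that this parser accepts once converted to JSON; it
// produces one BatchInput with kind BATCH_ELEMENT_COUNT.
//
//   batch_input [
//     {
//       kind: BATCH_ELEMENT_COUNT
//       target_name: "ELEMENT_COUNT"
//       data_type: TYPE_INT32
//       source_input: "INPUT0"
//     }
//   ]
//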
TRITONSERVER_DataType
ModelConfigDataTypeToTritonServerDataType(const std::string& data_type_str)
{
// Must start with "TYPE_".
if (data_type_str.rfind("TYPE_", 0) != 0) {
return TRITONSERVER_TYPE_INVALID;
}
const std::string dtype = data_type_str.substr(strlen("TYPE_"));
if (dtype == "BOOL") {
return TRITONSERVER_TYPE_BOOL;
} else if (dtype == "UINT8") {
return TRITONSERVER_TYPE_UINT8;
} else if (dtype == "UINT16") {
return TRITONSERVER_TYPE_UINT16;
} else if (dtype == "UINT32") {
return TRITONSERVER_TYPE_UINT32;
} else if (dtype == "UINT64") {
return TRITONSERVER_TYPE_UINT64;
} else if (dtype == "INT8") {
return TRITONSERVER_TYPE_INT8;
} else if (dtype == "INT16") {
return TRITONSERVER_TYPE_INT16;
} else if (dtype == "INT32") {
return TRITONSERVER_TYPE_INT32;
} else if (dtype == "INT64") {
return TRITONSERVER_TYPE_INT64;
} else if (dtype == "FP16") {
return TRITONSERVER_TYPE_FP16;
} else if (dtype == "FP32") {
return TRITONSERVER_TYPE_FP32;
} else if (dtype == "FP64") {
return TRITONSERVER_TYPE_FP64;
} else if (dtype == "STRING") {
return TRITONSERVER_TYPE_BYTES;
} else if (dtype == "BF16") {
return TRITONSERVER_TYPE_BF16;
}
return TRITONSERVER_TYPE_INVALID;
}
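//
// Example (worked cases, illustrative):
//   ModelConfigDataTypeToTritonServerDataType("TYPE_FP32")   -> TRITONSERVER_TYPE_FP32
//   ModelConfigDataTypeToTritonServerDataType("TYPE_STRING") -> TRITONSERVER_TYPE_BYTES
//   ModelConfigDataTypeToTritonServerDataType("FP32")        -> TRITONSERVER_TYPE_INVALID
//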
TRITONSERVER_Error*
BatchOutput::ParseFromModelConfig(
triton::common::TritonJson::Value& config,
std::vector<BatchOutput>* batch_outputs)
{
batch_outputs->clear();
triton::common::TritonJson::Value bos;
RETURN_IF_ERROR(config.MemberAsArray("batch_output", &bos));
for (size_t i = 0; i < bos.ArraySize(); ++i) {
batch_outputs->emplace_back();
auto& batch_output = batch_outputs->back();
triton::common::TritonJson::Value bo;
RETURN_IF_ERROR(bos.IndexAsObject(i, &bo));
{
triton::common::TritonJson::Value bo_target_names;
RETURN_IF_ERROR(bo.MemberAsArray("target_name", &bo_target_names));
for (size_t i = 0; i < bo_target_names.ArraySize(); ++i) {
std::string tn;
RETURN_IF_ERROR(bo_target_names.IndexAsString(i, &tn));
batch_output.target_names_.emplace_back(std::move(tn));
}
}
{
std::string bo_kind;
RETURN_IF_ERROR(bo.MemberAsString("kind", &bo_kind));
if (bo_kind == "BATCH_SCATTER_WITH_INPUT_SHAPE") {
batch_output.kind_ = Kind::BATCH_SCATTER_WITH_INPUT_SHAPE;
// Keep track of the output info for later cross reference with input
int64_t mbs = 0;
RETURN_IF_ERROR(config.MemberAsInt("max_batch_size", &mbs));
if (mbs != 0) {
batch_output.shape_.push_back(-1);
}
triton::common::TritonJson::Value ios;
RETURN_IF_ERROR(config.MemberAsArray("output", &ios));
for (size_t i = 0; i < ios.ArraySize(); i++) {
triton::common::TritonJson::Value io;
RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
if (io_name == batch_output.target_names_[0]) {
std::string io_dtype;
RETURN_IF_ERROR(io.MemberAsString("data_type", &io_dtype));
batch_output.data_type_ =
ModelConfigDataTypeToTritonServerDataType(io_dtype);
// If a reshape is provided for the input then use that when
// validating that the model matches what is expected.
triton::common::TritonJson::Value reshape;
if (io.Find("reshape", &reshape)) {
RETURN_IF_ERROR(
ParseShape(reshape, "shape", &batch_output.shape_));
} else {
RETURN_IF_ERROR(ParseShape(io, "dims", &batch_output.shape_));
}
break;
}
}
} else {
RETURN_ERROR_IF_FALSE(
false, TRITONSERVER_ERROR_INVALID_ARG,
std::string("unexpected batch output kind '" + bo_kind + "'"));
}
}
{
triton::common::TritonJson::Value bo_source_inputs;
RETURN_IF_ERROR(bo.MemberAsArray("source_input", &bo_source_inputs));
for (size_t i = 0; i < bo_source_inputs.ArraySize(); ++i) {
std::string si;
RETURN_IF_ERROR(bo_source_inputs.IndexAsString(i, &si));
batch_output.source_inputs_.emplace_back(std::move(si));
}
}
}
return nullptr; // success
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
std::string* value, const std::string& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
RETURN_IF_ERROR(json_value.MemberAsString("string_value", value));
} else {
*value = default_value;
}
return nullptr; // success
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
int* value, const int& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
std::string string_value;
RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value));
return ParseIntValue(string_value, value);
} else {
*value = default_value;
return nullptr; // success
}
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
bool* value, const bool& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
std::string string_value;
RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value));
return ParseBoolValue(string_value, value);
} else {
*value = default_value;
return nullptr; // success
}
}
TRITONSERVER_Error*
TryParseModelStringParameter(
triton::common::TritonJson::Value& params, const std::string& mkey,
uint64_t* value, const uint64_t& default_value)
{
triton::common::TritonJson::Value json_value;
if (params.Find(mkey.c_str(), &json_value)) {
std::string string_value;
RETURN_IF_ERROR(json_value.MemberAsString("string_value", &string_value));
return ParseUnsignedLongLongValue(string_value, value);
} else {
*value = default_value;
return nullptr; // success
}
}
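//
// Example usage (illustrative sketch; 'model_config' is the parsed model
// configuration JSON and "max_queue_size" is a hypothetical parameter name):
// read an optional numeric parameter, falling back to a default.
//
//   triton::common::TritonJson::Value params;
//   uint64_t max_queue_size = 0;
//   if (model_config.Find("parameters", &params)) {
//     RETURN_IF_ERROR(TryParseModelStringParameter(
//         params, "max_queue_size", &max_queue_size, 4 /* default */));
//   }
//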
namespace {
template <typename T>
TRITONSERVER_Error*
BufferAsTypedString(
std::string& str, const char* buffer, const size_t element_cnt)
{
const T* vals = reinterpret_cast<const T*>(buffer);
str += "[ ";
for (size_t i = 0; i < element_cnt; ++i) {
const T& v = vals[i];
if (i != 0) {
str += ", ";
}
str += std::to_string(v);
}
str += " ]";
return nullptr; // success
}
} // namespace
TRITONSERVER_Error*
BufferAsTypedString(
std::string& str, const char* buffer, size_t buffer_byte_size,
TRITONSERVER_DataType datatype)
{
const size_t element_cnt =
buffer_byte_size / TRITONSERVER_DataTypeByteSize(datatype);
switch (datatype) {
case TRITONSERVER_TYPE_UINT8:
return BufferAsTypedString<uint8_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_UINT16:
return BufferAsTypedString<uint16_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_UINT32:
return BufferAsTypedString<uint32_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_UINT64:
return BufferAsTypedString<uint64_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT8:
return BufferAsTypedString<int8_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT16:
return BufferAsTypedString<int16_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT32:
return BufferAsTypedString<int32_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_INT64:
return BufferAsTypedString<int64_t>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_FP32:
return BufferAsTypedString<float>(str, buffer, element_cnt);
case TRITONSERVER_TYPE_FP64:
return BufferAsTypedString<double>(str, buffer, element_cnt);
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
std::string("class result not available for output due to "
"unsupported type '") +
std::string(TRITONSERVER_DataTypeString(datatype)) + "'")
.c_str());
}
return nullptr; // success
}
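//
// Example (worked case, illustrative): three INT32 values are rendered as a
// readable string, e.g. for verbose logging of an output buffer.
//
//   const int32_t vals[3] = {1, 2, 3};
//   std::string str;
//   RETURN_IF_ERROR(BufferAsTypedString(
//       str, reinterpret_cast<const char*>(vals), sizeof(vals),
//       TRITONSERVER_TYPE_INT32));
//   // str == "[ 1, 2, 3 ]"
//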
std::string
GetRequestId(TRITONBACKEND_Request* request)
{
const char* request_id = nullptr;
LOG_IF_ERROR(
TRITONBACKEND_RequestId(request, &request_id),
"unable to retrieve request ID string");
if ((request_id == nullptr) || (request_id[0] == '\0')) {
request_id = "<id_unknown>";
}
return std::string("[request id: ") + request_id + "] ";
}
}} // namespace triton::backend
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_input_collector.h"
#include <atomic>
#include "triton/backend/backend_common.h"
#ifdef TRITON_ENABLE_GPU
#include "kernel.h"
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
//
// BackendInputCollector::InputIterator
//
BackendInputCollector::InputIterator::InputIterator(
TRITONBACKEND_Request** requests, const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses, const char* input_name,
const char* host_policy_name, const bool coalesce_request_input)
: requests_(requests), request_count_(request_count), responses_(responses),
input_name_(input_name), host_policy_(host_policy_name),
coalesce_request_input_(coalesce_request_input), curr_request_idx_(0),
curr_buffer_idx_(0), reach_end_(false)
{
auto& response = (*responses_)[curr_request_idx_];
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_RequestInput(
requests_[curr_request_idx_], input_name_, &curr_input_));
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
curr_input_, host_policy_, nullptr, nullptr, nullptr,
nullptr, nullptr, &curr_buffer_cnt_));
}
bool
BackendInputCollector::InputIterator::GetNextContiguousInput(
ContiguousBuffer* input)
{
if (reach_end_ || (curr_buffer_idx_ >= curr_buffer_cnt_)) {
return false;
}
// Get the first buffer
TRITONBACKEND_InputBufferForHostPolicy(
curr_input_, host_policy_, curr_buffer_idx_,
reinterpret_cast<const void**>(&input->memory_desc_.buffer_),
&input->memory_desc_.byte_size_, &input->memory_desc_.memory_type_,
&input->memory_desc_.memory_type_id_);
++curr_buffer_idx_;
input->start_request_idx_ = curr_request_idx_;
input->end_request_idx_ = curr_request_idx_;
if (!coalesce_request_input_) {
if (curr_buffer_idx_ >= curr_buffer_cnt_) {
++curr_request_idx_;
if (curr_request_idx_ < request_count_) {
auto& response = (*responses_)[curr_request_idx_];
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestInput(
requests_[curr_request_idx_], input_name_, &curr_input_));
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
curr_input_, host_policy_, nullptr, nullptr, nullptr,
nullptr, nullptr, &curr_buffer_cnt_));
// reset buffer idx
curr_buffer_idx_ = 0;
} else {
reach_end_ = true;
}
}
return true;
}
do {
for (; curr_buffer_idx_ < curr_buffer_cnt_; ++curr_buffer_idx_) {
const void* next_buffer;
size_t next_buffer_byte_size;
TRITONSERVER_MemoryType next_memory_type;
int64_t next_memory_type_id;
TRITONBACKEND_InputBufferForHostPolicy(
curr_input_, host_policy_, curr_buffer_idx_, &next_buffer,
&next_buffer_byte_size, &next_memory_type, &next_memory_type_id);
if (((input->memory_desc_.buffer_ + input->memory_desc_.byte_size_) !=
next_buffer) ||
(input->memory_desc_.memory_type_ != next_memory_type) ||
(input->memory_desc_.memory_type_id_ != next_memory_type_id)) {
return true;
}
input->memory_desc_.byte_size_ += next_buffer_byte_size;
input->end_request_idx_ = curr_request_idx_;
}
// Iterated all buffers for current request, check next
++curr_request_idx_;
if (curr_request_idx_ < request_count_) {
auto& response = (*responses_)[curr_request_idx_];
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestInput(
requests_[curr_request_idx_], input_name_, &curr_input_));
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
curr_input_, host_policy_, nullptr, nullptr, nullptr,
nullptr, nullptr, &curr_buffer_cnt_));
// reset buffer idx
curr_buffer_idx_ = 0;
}
} while (curr_request_idx_ < request_count_);
reach_end_ = true;
return true;
}
//
// BackendInputCollector
//
bool
BackendInputCollector::GetInputBufferIfContiguous(
const char* input_name, const char** buffer, size_t* buffer_byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
*buffer = nullptr;
*buffer_byte_size = 0;
const char* expected_next_buffer = nullptr;
bool contiguous = true;
for (size_t idx = 0; idx < request_count_; idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
TRITONBACKEND_Input* input;
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_RequestInput(request, input_name, &input));
uint64_t byte_size;
uint32_t buffer_count;
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, nullptr,
nullptr, &byte_size, &buffer_count));
for (size_t idx = 0; idx < buffer_count; ++idx) {
const void* src_buffer;
size_t src_byte_size;
TRITONSERVER_MemoryType src_memory_type;
int64_t src_memory_type_id;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_InputBufferForHostPolicy(
input, host_policy_cstr_, idx, &src_buffer, &src_byte_size,
&src_memory_type, &src_memory_type_id));
if (*buffer != nullptr) {
// A second buffer has been seen. If coalescing of request inputs is not
// requested, or this buffer does not directly extend the previous one,
// treat the input as not contiguous.
if (coalesce_request_input_ && (expected_next_buffer == src_buffer) &&
(*memory_type == src_memory_type) &&
(*memory_type_id == src_memory_type_id)) {
expected_next_buffer += src_byte_size;
} else {
contiguous = false;
}
// Want to know total buffer byte size even if it is not contiguous
*buffer_byte_size += src_byte_size;
} else {
*buffer = reinterpret_cast<const char*>(src_buffer);
*memory_type = src_memory_type;
*memory_type_id = src_memory_type_id;
*buffer_byte_size = src_byte_size;
expected_next_buffer = *buffer + src_byte_size;
}
}
}
return contiguous;
}
void
BackendInputCollector::ProcessTensor(
const char* input_name, char* buffer, const size_t buffer_byte_size,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that a pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
const bool use_kernel = (kernel_buffer_threshold_ != 0);
size_t buffer_offset = 0;
InputIterator ii(
requests_, request_count_, responses_, input_name, host_policy_cstr_,
coalesce_request_input_);
ContiguousBuffer input;
while (ii.GetNextContiguousInput(&input)) {
// If there are pending copies from the tensor buffer that are not
// contiguous with this input's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(buffer_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(
buffer, buffer_byte_size, memory_type, memory_type_id);
}
if ((pending_copy_kernel_buffer_byte_size_ > 0) &&
(buffer_offset != (pending_copy_kernel_buffer_byte_size_ +
pending_copy_kernel_buffer_offset_))) {
need_sync_ |= FlushPendingCopyKernel(
buffer, buffer_byte_size, memory_type, memory_type_id);
}
need_sync_ |= SetInputTensor(
input_name, input, buffer, buffer_byte_size, memory_type,
memory_type_id, buffer_offset, use_pinned_memory_type, use_kernel,
true);
buffer_offset += input.memory_desc_.byte_size_;
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |=
FlushPendingPinned(buffer, buffer_byte_size, memory_type, memory_type_id);
need_sync_ |= FlushPendingCopyKernel(
buffer, buffer_byte_size, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
}
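// Illustrative sketch only (not part of the original source): how a backend
// might use the ProcessTensor() overload above to gather an input named
// "INPUT0" into a pre-allocated buffer on GPU 0. The collector and buffer
// arguments are assumptions.
inline bool
ExampleGatherInput(
    BackendInputCollector* collector, char* gpu_buffer,
    const size_t gpu_buffer_byte_size)
{
  // Gather all request buffers for "INPUT0" into 'gpu_buffer' on GPU 0.
  collector->ProcessTensor(
      "INPUT0", gpu_buffer, gpu_buffer_byte_size, TRITONSERVER_MEMORY_GPU,
      0 /* memory_type_id */);
  // Finalize() reports whether asynchronous copies were issued on the
  // collector's CUDA stream; if true the caller must synchronize that
  // stream before reading 'gpu_buffer'.
  return collector->Finalize();
}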
TRITONSERVER_Error*
BackendInputCollector::ProcessTensor(
const char* input_name, char* buffer, const size_t buffer_byte_size,
const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
allowed_input_types,
const char** dst_buffer, size_t* dst_buffer_byte_size,
TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
if (buffer == nullptr) {
if (allowed_input_types.size() == 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must contain at least one pair of memory type "
"and id");
}
if (GetInputBufferIfContiguous(
input_name, dst_buffer, dst_buffer_byte_size, dst_memory_type,
dst_memory_type_id)) {
// A zero-size buffer is also treated as contiguous, but in that case we
// still fall through to allocate backend memory so that a valid address
// is returned.
if (*dst_buffer_byte_size != 0) {
// If the buffer is contiguous, check whether its memory type and id are accepted by the caller
for (const auto& allowed_type : allowed_input_types) {
if ((*dst_memory_type == allowed_type.first) &&
((*dst_memory_type_id == allowed_type.second))) {
return nullptr; // success
}
}
}
}
// A separate buffer is needed
BackendMemory* backend_memory = nullptr;
for (const auto& allowed_type : allowed_input_types) {
std::vector<BackendMemory::AllocationType> alloc_types;
const int64_t memory_type_id = allowed_type.second;
switch (allowed_type.first) {
case TRITONSERVER_MEMORY_GPU:
alloc_types = {BackendMemory::AllocationType::GPU_POOL,
BackendMemory::AllocationType::GPU};
break;
case TRITONSERVER_MEMORY_CPU_PINNED:
alloc_types = {BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED};
break;
case TRITONSERVER_MEMORY_CPU:
alloc_types = {BackendMemory::AllocationType::CPU};
break;
}
auto err = BackendMemory::Create(
memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
&backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(allowed_type.first) +
" id: " + std::to_string(memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
break;
}
}
if (backend_memory == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("failed to allocate contiguous buffer for input '") +
input_name + "'")
.c_str());
}
buffer = backend_memory->MemoryPtr();
*dst_buffer = backend_memory->MemoryPtr();
*dst_buffer_byte_size = backend_memory->ByteSize();
*dst_memory_type = backend_memory->MemoryType();
*dst_memory_type_id = backend_memory->MemoryTypeId();
} else {
if (allowed_input_types.size() != 1) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must only contain the memory type and id of "
"'buffer'");
}
*dst_buffer = buffer;
*dst_buffer_byte_size = buffer_byte_size;
*dst_memory_type = allowed_input_types[0].first;
*dst_memory_type_id = allowed_input_types[0].second;
}
if (*dst_buffer_byte_size != 0) {
ProcessTensor(
input_name, buffer, *dst_buffer_byte_size, *dst_memory_type,
*dst_memory_type_id);
}
return nullptr; // success
}
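// Illustrative sketch only (not part of the original source): using the
// overload above to obtain a contiguous buffer for "INPUT0", letting the
// collector allocate memory if the request buffers are not already
// contiguous. All names other than the collector API are assumptions.
inline TRITONSERVER_Error*
ExampleContiguousInput(
    BackendInputCollector* collector, const char** dst_buffer,
    size_t* dst_byte_size, TRITONSERVER_MemoryType* dst_memory_type,
    int64_t* dst_memory_type_id)
{
  // Prefer GPU 0 and fall back to pinned CPU memory.
  const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed{
      {TRITONSERVER_MEMORY_GPU, 0}, {TRITONSERVER_MEMORY_CPU_PINNED, 0}};
  return collector->ProcessTensor(
      "INPUT0", nullptr /* buffer */, 0 /* buffer_byte_size */, allowed,
      dst_buffer, dst_byte_size, dst_memory_type, dst_memory_type_id);
}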
bool
BackendInputCollector::Finalize()
{
#ifdef TRITON_ENABLE_GPU
if ((!deferred_pinned_.empty()) && need_sync_) {
if (event_ != nullptr) {
cudaEventSynchronize(event_);
} else {
cudaStreamSynchronize(stream_);
}
need_sync_ = false;
}
#endif // TRITON_ENABLE_GPU
// After the above sync all the GPU->pinned copies are complete. Any
// deferred copies of pinned->CPU can now be done.
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
for (auto& def : deferred_pinned_) {
if (!def.finalized_) {
need_sync_ |= def.Finalize(stream_);
}
}
for (size_t i = 0; i < async_task_count_; i++) {
need_sync_ |= completion_queue_.Get();
}
#ifdef TRITON_ENABLE_GPU
// Record the new event location if deferred copies occur
if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
return need_sync_;
}
bool
BackendInputCollector::DeferredPinned::Finalize(cudaStream_t stream)
{
bool cuda_used = false;
auto err = CopyBuffer(
"pinned buffer", TRITONSERVER_MEMORY_CPU_PINNED, 0, tensor_memory_type_,
tensor_memory_id_, pinned_memory_size_, pinned_memory_,
tensor_buffer_ + tensor_buffer_offset_, stream, &cuda_used);
// If something goes wrong with the copy all the pending
// responses fail...
if (err != nullptr) {
for (auto& pr : requests_) {
for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
++idx) {
if ((*responses_)[idx] != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
(*responses_)[idx], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
err),
"failed to send error response");
(*responses_)[idx] = nullptr;
}
}
}
TRITONSERVER_ErrorDelete(err);
}
return cuda_used;
}
bool
BackendInputCollector::SetInputTensor(
const char* input_name, const ContiguousBuffer& input, char* tensor_buffer,
const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,
const TRITONSERVER_MemoryType use_pinned_memory_type, const bool use_kernel,
const bool wait_buffer)
{
bool cuda_copy = false;
if ((tensor_buffer_offset + input.memory_desc_.byte_size_) >
tensor_buffer_byte_size) {
for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
++i) {
RESPOND_AND_SET_NULL_IF_ERROR(
&(*responses_)[i],
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string(
"unexpected total byte size " +
std::to_string(
tensor_buffer_offset + input.memory_desc_.byte_size_) +
" for input '" + input_name + "', expecting " +
std::to_string(tensor_buffer_byte_size))
.c_str()));
}
return cuda_copy;
}
// If the request buffer matches the memory type that should use an
// intermediate pinned memory buffer for the transfer, then just
// record the input as pending and increase the size required for
// the intermediate pinned buffer. We only do this check for the
// first buffer of an input and apply the same policy for all
// buffers. So if an input's data is split over different memory
// types this may not be ideal but that should be a very rare
// situation.
if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&
(input.memory_desc_.memory_type_ == use_pinned_memory_type)) {
if (pending_pinned_byte_size_ == 0) {
pending_pinned_offset_ = tensor_buffer_offset;
}
pending_pinned_byte_size_ += input.memory_desc_.byte_size_;
pending_pinned_input_buffers_.push_back(input);
return cuda_copy;
}
// [FIXME] support the other direction if proven to be faster; all kernel
// handling code in this class assumes the destination buffer is on device.
// If the request buffer and the destination buffer are accessible by all
// GPUs (i.e. pinned, device), initiate the copy via copy CUDA kernel.
// We only do this check for the
// first buffer of an input and apply the same policy for all
// buffers. So if an input's data is split over different memory
// types this may not be ideal but that should be a very rare
// situation.
// Currently checked direction:
// pinned -> device
// same device -> device
// different device -> device
if (use_kernel &&
(input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_CPU) &&
(tensor_memory_type == TRITONSERVER_MEMORY_GPU)) {
// [FIXME] Currently not allowing copy between devices as it requires
// peer-to-peer access to be enabled. Peer-to-peer is enabled by default,
// but the server can still run even if it fails to enable peer-to-peer.
// Should provide a utility to check whether a device pair allows direct
// access and use gather kernel accordingly
if ((input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_GPU) ||
(input.memory_desc_.memory_type_id_ == tensor_memory_type_id)) {
if (pending_copy_kernel_buffer_byte_size_ == 0) {
pending_copy_kernel_buffer_offset_ = tensor_buffer_offset;
}
pending_copy_kernel_buffer_byte_size_ += input.memory_desc_.byte_size_;
++pending_copy_kernel_input_buffer_counts_;
pending_copy_kernel_input_buffers_.push_back(input);
return cuda_copy;
}
}
#ifdef TRITON_ENABLE_GPU
if (wait_buffer && (buffer_ready_event_ != nullptr)) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
// Direct copy without intermediate pinned memory.
bool cuda_used = false;
auto err = CopyBuffer(
input_name, input.memory_desc_.memory_type_,
input.memory_desc_.memory_type_id_, tensor_memory_type,
tensor_memory_type_id, input.memory_desc_.byte_size_,
input.memory_desc_.buffer_, tensor_buffer + tensor_buffer_offset, stream_,
&cuda_used, copy_on_stream_);
if (err != nullptr) {
for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
++i) {
RESPOND_AND_SET_NULL_IF_ERROR(
&(*responses_)[i],
TRITONSERVER_ErrorNew(
TRITONSERVER_ErrorCode(err), TRITONSERVER_ErrorMessage(err)));
}
TRITONSERVER_ErrorDelete(err);
}
cuda_copy |= cuda_used;
return cuda_copy;
}
bool
BackendInputCollector::FlushPendingPinned(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
bool cuda_copy = false;
// Will be copying from CPU->pinned->GPU or GPU->pinned->CPU
// Attempt to allocate a pinned buffer to use for staging the
// copy... if we fail to allocate the pinned buffer then we just
// directly go CPU->GPU or GPU->CPU.
char* pinned_memory = nullptr;
int64_t pinned_memory_type_id = 0;
TRITONSERVER_MemoryType pinned_memory_type;
BackendMemory* backend_memory;
if (pending_pinned_byte_size_ > 0) {
TRITONSERVER_Error* err = BackendMemory::Create(
memory_manager_,
{BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED},
0 /* memory_type_id */, pending_pinned_byte_size_, &backend_memory);
if (err != nullptr) {
TRITONSERVER_ErrorDelete(err);
} else {
pinned_memory = backend_memory->MemoryPtr();
pinned_memory_type = backend_memory->MemoryType();
pinned_memory_type_id = backend_memory->MemoryTypeId();
}
}
// If the pinned buffer wasn't actually allocated then just perform
// a direct copy.
if (pinned_memory == nullptr) {
size_t offset = 0;
for (auto& pr : pending_pinned_input_buffers_) {
cuda_copy |= SetInputTensor(
"pinned fallback", pr, tensor_buffer, tensor_buffer_byte_size,
tensor_memory_type, tensor_memory_type_id,
pending_pinned_offset_ + offset, TRITONSERVER_MEMORY_CPU_PINNED,
false, true);
offset += pr.memory_desc_.byte_size_;
}
}
// We have a pinned buffer so copy the pending input buffer(s) into
// the pinned memory.
else { // pinned_memory_type == TRITONSERVER_MEMORY_CPU_PINNED
bool cuda_used = false;
size_t offset = 0;
if (!use_async_cpu_copy_) {
for (auto& pr : pending_pinned_input_buffers_) {
cuda_used |= SetInputTensor(
"pinned H2H", pr, pinned_memory, pending_pinned_byte_size_,
TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, offset,
TRITONSERVER_MEMORY_CPU_PINNED, false, true);
offset += pr.memory_desc_.byte_size_;
}
cuda_copy |= cuda_used;
// If the copy was not async (i.e. the request input was in CPU memory
// so a CPU->CPU-PINNED copy was performed above), then the pinned
// buffer now holds the tensor contents and we can immediately
// issue the copies from the pinned buffer to the tensor.
//
// Otherwise the GPU->CPU-PINNED async copies are in flight and we
// simply remember the pinned buffer and the corresponding
// request inputs so that we can do the pinned->CPU copies in
// finalize after we have waited for all async copies to complete.
if (!cuda_used) {
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
auto err = CopyBuffer(
"pinned input buffer H2D", TRITONSERVER_MEMORY_CPU_PINNED,
0 /* memory_type_id */, tensor_memory_type, tensor_memory_type_id,
pending_pinned_byte_size_, pinned_memory,
tensor_buffer + pending_pinned_offset_, stream_, &cuda_used,
copy_on_stream_);
cuda_copy |= cuda_used;
// If something goes wrong with the copy all the pending
// responses fail...
if (err != nullptr) {
for (auto& pr : pending_pinned_input_buffers_) {
for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
++idx) {
if ((*responses_)[idx] != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
(*responses_)[idx],
TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
"failed to send error response");
(*responses_)[idx] = nullptr;
}
}
}
TRITONSERVER_ErrorDelete(err);
}
} else { // cuda_used
deferred_pinned_.emplace_back(
pinned_memory, pending_pinned_byte_size_, tensor_buffer,
pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id,
std::move(pending_pinned_input_buffers_), responses_);
}
} else {
async_task_count_++;
deferred_pinned_.emplace_back(
pinned_memory, pending_pinned_byte_size_, tensor_buffer,
pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id,
std::move(pending_pinned_input_buffers_), responses_);
auto& deferred_pinned = deferred_pinned_.back();
// Mark finalized to avoid a duplicate call to DeferredPinned::Finalize()
// in BackendInputCollector::Finalize()
deferred_pinned_.back().finalized_ = true;
auto incomplete_count = new std::atomic<size_t>(std::min(
deferred_pinned_.back().requests_.size(),
triton::common::AsyncWorkQueue::WorkerCount()));
auto pending_pinned_byte_size = pending_pinned_byte_size_;
size_t stride = (deferred_pinned_.back().requests_.size() +
triton::common::AsyncWorkQueue::WorkerCount() - 1) /
triton::common::AsyncWorkQueue::WorkerCount();
auto pending_it = deferred_pinned_.back().requests_.begin();
while (pending_it != deferred_pinned_.back().requests_.end()) {
auto end_it = pending_it;
auto next_offset = offset;
for (size_t idx = 0; idx < stride; idx++) {
next_offset += end_it->memory_desc_.byte_size_;
end_it++;
if (end_it == deferred_pinned_.back().requests_.end()) {
break;
}
}
auto err =
CommonErrorToTritonError(triton::common::AsyncWorkQueue::AddTask(
[this, offset, pinned_memory, pinned_memory_type,
pending_pinned_byte_size, pinned_memory_type_id, pending_it,
end_it, incomplete_count, &deferred_pinned]() mutable {
for (; pending_it != end_it; pending_it++) {
SetInputTensor(
"pinned async H2H", *pending_it, pinned_memory,
pending_pinned_byte_size, pinned_memory_type,
pinned_memory_type_id, offset,
TRITONSERVER_MEMORY_CPU_PINNED, false, false);
offset += pending_it->memory_desc_.byte_size_;
}
// The last segmented task will start the next phase of
// the internal pinned buffer copy
if (incomplete_count->fetch_sub(1) == 1) {
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
completion_queue_.Put(deferred_pinned.Finalize(stream_));
delete incomplete_count;
}
}));
if (err != nullptr) {
for (; pending_it != end_it; pending_it++) {
for (size_t idx = pending_it->start_request_idx_;
idx <= pending_it->end_request_idx_; ++idx) {
if ((*responses_)[idx] != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
(*responses_)[idx],
TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
"failed to send error response");
(*responses_)[idx] = nullptr;
}
}
}
}
TRITONSERVER_ErrorDelete(err);
offset = next_offset;
pending_it = end_it;
}
}
}
// Pending pinned copies are handled...
pending_pinned_byte_size_ = 0;
pending_pinned_offset_ = 0;
pending_pinned_input_buffers_.clear();
// Need to hold on to the allocated pinned buffer as there are still
// copies in flight. Will delete it in finalize.
if (pinned_memory != nullptr) {
in_use_memories_.emplace_back(backend_memory);
}
return cuda_copy;
}
TRITONSERVER_Error*
BackendInputCollector::BatchInputShape(
const BatchInput& batch_input, std::vector<int64_t>* shape)
{
*shape = std::vector<int64_t>{0};
switch (batch_input.BatchInputKind()) {
case BatchInput::Kind::BATCH_ELEMENT_COUNT:
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
(*shape)[0] = request_count_;
break;
}
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
(*shape)[0] = request_count_ + 1;
break;
}
case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
const auto& source_input = batch_input.SourceInputs()[0];
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape_arr;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
nullptr, nullptr));
(*shape)[0] =
std::max((*shape)[0], GetElementCount(shape_arr, dims_count));
}
break;
}
case BatchInput::Kind::BATCH_ITEM_SHAPE: {
shape->emplace_back(0);
const auto& source_input = batch_input.SourceInputs()[0];
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape_arr;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
nullptr, nullptr));
// Assuming the first dimension is the batch size and ragged input is
// only set for batching-enabled models.
(*shape)[0] += shape_arr[0];
// The batch input tracks the shape without batch dimension for
// each batch item
(*shape)[1] = (dims_count - 1);
}
break;
}
case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
const auto& source_input = batch_input.SourceInputs()[0];
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape_arr;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
nullptr, nullptr));
// Assuming the first dimension is the batch size and ragged input is
// only set for batching-enabled models.
// The batch input tracks the shape without batch dimension for
// each batch item
(*shape)[0] += (shape_arr[0] * (dims_count - 1));
}
break;
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, "unsupported BatchInputKind received");
}
return nullptr; // success
}
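// Worked example (illustrative only, not part of the original source):
// with three requests whose source-input element counts are 4, 2 and 3,
// BatchInputShape() above produces
//   BATCH_ELEMENT_COUNT                        -> shape [3]
//   BATCH_ACCUMULATED_ELEMENT_COUNT            -> shape [3]
//   BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO  -> shape [4]
//   BATCH_MAX_ELEMENT_COUNT_AS_SHAPE           -> shape [4] (the max count)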
TRITONSERVER_Error*
BackendInputCollector::ProcessBatchInput(
const BatchInput& batch_input, char* buffer, const size_t buffer_byte_size,
const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
allowed_input_types,
const char** dst_buffer, size_t* dst_buffer_byte_size,
TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
if (buffer == nullptr) {
if (allowed_input_types.size() == 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must contain at least one pair of memory type "
"and id");
}
// Calculate the byte size of the buffer
std::vector<int64_t> shape;
RETURN_IF_ERROR(BatchInputShape(batch_input, &shape));
*dst_buffer_byte_size = GetByteSize(batch_input.DataType(), shape);
BackendMemory* backend_memory = nullptr;
for (const auto& allowed_type : allowed_input_types) {
std::vector<BackendMemory::AllocationType> alloc_types;
const int64_t memory_type_id = allowed_type.second;
switch (allowed_type.first) {
case TRITONSERVER_MEMORY_GPU:
alloc_types = {BackendMemory::AllocationType::GPU_POOL,
BackendMemory::AllocationType::GPU};
break;
case TRITONSERVER_MEMORY_CPU_PINNED:
alloc_types = {BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED};
break;
case TRITONSERVER_MEMORY_CPU:
alloc_types = {BackendMemory::AllocationType::CPU};
break;
}
auto err = BackendMemory::Create(
memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
&backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(allowed_type.first) +
" id: " + std::to_string(memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
break;
}
}
if (backend_memory == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string(
"failed to allocate contiguous buffer for batch input '") +
batch_input.TargetNames()[0] + "'")
.c_str());
}
buffer = backend_memory->MemoryPtr();
*dst_buffer = backend_memory->MemoryPtr();
*dst_buffer_byte_size = backend_memory->ByteSize();
*dst_memory_type = backend_memory->MemoryType();
*dst_memory_type_id = backend_memory->MemoryTypeId();
} else {
if (allowed_input_types.size() != 1) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"'allowed_input_types' must only contain the memory type and id of "
"'buffer'");
}
*dst_buffer = buffer;
*dst_buffer_byte_size = buffer_byte_size;
*dst_memory_type = allowed_input_types[0].first;
*dst_memory_type_id = allowed_input_types[0].second;
}
char* input_buffer = buffer;
std::unique_ptr<BackendMemory> internal_buffer;
// Need a CPU buffer for modifying the value
if (*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
BackendMemory* ib = nullptr;
RETURN_IF_ERROR(BackendMemory::Create(
memory_manager_,
{BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU},
0, *dst_buffer_byte_size, &ib));
internal_buffer.reset(ib);
input_buffer = internal_buffer->MemoryPtr();
}
const auto& data_type = batch_input.DataType();
switch (batch_input.BatchInputKind()) {
case BatchInput::Kind::BATCH_ELEMENT_COUNT: {
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
RETURN_IF_ERROR(SetElementCount<float>(
source_input, input_buffer, *dst_buffer_byte_size));
} else {
RETURN_IF_ERROR(SetElementCount<int32_t>(
source_input, input_buffer, *dst_buffer_byte_size));
}
break;
}
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
RETURN_IF_ERROR(SetAccumulatedElementCount<float>(
source_input, input_buffer, *dst_buffer_byte_size));
} else {
RETURN_IF_ERROR(SetAccumulatedElementCount<int32_t>(
source_input, input_buffer, *dst_buffer_byte_size));
}
break;
}
case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
*reinterpret_cast<float*>(input_buffer) = 0;
RETURN_IF_ERROR(SetAccumulatedElementCount<float>(
source_input, input_buffer + sizeof(float),
*dst_buffer_byte_size - sizeof(float)));
} else {
*reinterpret_cast<int32_t*>(input_buffer) = 0;
RETURN_IF_ERROR(SetAccumulatedElementCount<int32_t>(
source_input, input_buffer + sizeof(int32_t),
*dst_buffer_byte_size - sizeof(int32_t)));
}
break;
}
case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
// The batch input is described by the shape,
// no data modification is needed
return nullptr; // success
}
case BatchInput::Kind::BATCH_ITEM_SHAPE:
case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
// Use the same utilities for both types as the data will be the same,
// only difference is the shape of the tensor.
const auto& source_input = batch_input.SourceInputs()[0];
if (data_type == TRITONSERVER_TYPE_FP32) {
*reinterpret_cast<float*>(input_buffer) = 0;
RETURN_IF_ERROR(SetBatchItemShape<float>(
source_input, input_buffer, *dst_buffer_byte_size));
} else {
*reinterpret_cast<int32_t*>(input_buffer) = 0;
RETURN_IF_ERROR(SetBatchItemShape<int32_t>(
source_input, input_buffer, *dst_buffer_byte_size));
}
break;
}
}
if (*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
bool cuda_used;
RETURN_IF_ERROR(CopyBuffer(
"batch input buffer", internal_buffer->MemoryType(),
internal_buffer->MemoryTypeId(), *dst_memory_type, *dst_memory_type_id,
*dst_buffer_byte_size, input_buffer, buffer, stream_, &cuda_used,
copy_on_stream_));
// Need to keep the backend memory alive in the case of async copy
in_use_memories_.emplace_back(std::move(internal_buffer));
need_sync_ |= cuda_used;
}
return nullptr; // success
}
template <typename T>
TRITONSERVER_Error*
BackendInputCollector::SetElementCount(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size)
{
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
if (buffer_offset + sizeof(T) > buffer_byte_size) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
"unexpected total byte size for batch input");
}
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
nullptr, nullptr));
*(reinterpret_cast<T*>(buffer) + req_idx) =
GetElementCount(shape, dims_count);
buffer_offset += sizeof(T);
}
// Set the rest of the buffer to 0
for (; buffer_offset + sizeof(T) <= buffer_byte_size;
buffer_offset += sizeof(T)) {
*reinterpret_cast<T*>(buffer + buffer_offset) = 0;
}
return nullptr; // success
}
template <typename T>
TRITONSERVER_Error*
BackendInputCollector::SetAccumulatedElementCount(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size)
{
size_t accumulated_element_count = 0;
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
if (buffer_offset + sizeof(T) > buffer_byte_size) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
"unexpected total byte size for batch input");
}
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
nullptr, nullptr));
accumulated_element_count += GetElementCount(shape, dims_count);
*(reinterpret_cast<T*>(buffer) + req_idx) = accumulated_element_count;
buffer_offset += sizeof(T);
}
// Set the rest of the buffer to 'accumulated_element_count'
// (no increase in element count)
for (; buffer_offset + sizeof(T) <= buffer_byte_size;
buffer_offset += sizeof(T)) {
*reinterpret_cast<T*>(buffer + buffer_offset) = accumulated_element_count;
}
return nullptr; // success
}
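// Worked example (illustrative only, not part of the original source):
// for three requests with element counts 4, 2 and 3 and a buffer that
// holds five elements, SetElementCount() above writes {4, 2, 3, 0, 0}
// while SetAccumulatedElementCount() writes {4, 6, 9, 9, 9}; the trailing
// entries are padding for any batch slots beyond the request count.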
template <typename T>
TRITONSERVER_Error*
BackendInputCollector::SetBatchItemShape(
const std::string& source_input, char* buffer,
const size_t buffer_byte_size)
{
size_t buffer_offset = 0;
for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
TRITONBACKEND_Input* input;
RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
requests_[req_idx], source_input.c_str(), &input));
const int64_t* shape;
uint32_t dims_count;
RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,
nullptr, nullptr));
// Assuming the first dimension is the batch size and ragged input is
// only set for batching-enabled models.
size_t batch_1_size = sizeof(T) * (dims_count - 1);
if (buffer_offset + (size_t)shape[0] * batch_1_size > buffer_byte_size) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(GetRequestId(requests_[req_idx]) +
"unexpected total byte size for batch input")
.c_str());
}
// The batch input tracks the shape without batch dimension for
// each batch item
for (size_t idx = 1; idx < dims_count; ++idx) {
// Need to set the element explicitly for type conversion
*(reinterpret_cast<T*>(buffer + buffer_offset) + (idx - 1)) = shape[idx];
}
// memcpy the data repeatedly if the request has batch size > 1
for (int64_t idx = 1; idx < shape[0]; ++idx) {
memcpy(
buffer + buffer_offset + idx * batch_1_size, buffer + buffer_offset,
batch_1_size);
}
buffer_offset += batch_1_size * (size_t)shape[0];
}
return nullptr; // success
}
bool
BackendInputCollector::FlushPendingCopyKernel(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
if (pending_copy_kernel_input_buffers_.size() == 0) {
return false;
}
bool cuda_copy = false;
TRITONSERVER_Error* error = nullptr;
// Only try to launch kernel if buffer count is large enough for
// good GPU utilization
if (pending_copy_kernel_input_buffer_counts_ >= kernel_buffer_threshold_) {
error = LaunchCopyKernel(
tensor_buffer, tensor_buffer_byte_size, tensor_memory_type,
tensor_memory_type_id);
cuda_copy = (error == nullptr);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("gather kernel launched with status: ") +
((error == nullptr) ? "Success" : TRITONSERVER_ErrorMessage(error)))
.c_str());
}
// If kernel can't be launched then just perform a direct copy.
if ((pending_copy_kernel_input_buffer_counts_ < kernel_buffer_threshold_) ||
(error != nullptr)) {
size_t offset = 0;
for (auto& pr : pending_copy_kernel_input_buffers_) {
cuda_copy |= SetInputTensor(
"gather kernel fallback", pr, tensor_buffer, tensor_buffer_byte_size,
tensor_memory_type, tensor_memory_type_id,
pending_copy_kernel_buffer_offset_ + offset,
TRITONSERVER_MEMORY_CPU_PINNED, false, true);
offset += pr.memory_desc_.byte_size_;
}
}
TRITONSERVER_ErrorDelete(error);
// Pending kernel copies are handled...
pending_copy_kernel_buffer_byte_size_ = 0;
pending_copy_kernel_buffer_offset_ = 0;
pending_copy_kernel_input_buffer_counts_ = 0;
pending_copy_kernel_input_buffers_.clear();
return cuda_copy;
}
TRITONSERVER_Error*
BackendInputCollector::LaunchCopyKernel(
char* tensor_buffer, const size_t tensor_buffer_byte_size,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
input_ptr_buffer_host_.emplace_back(new std::vector<int8_t*>());
byte_size_buffer_host_.emplace_back(new std::vector<size_t>());
byte_size_offset_buffer_host_.emplace_back(new std::vector<size_t>());
auto& input_ptr_buffer_host = *input_ptr_buffer_host_.back();
auto& byte_size_buffer_host = *byte_size_buffer_host_.back();
auto& byte_size_offset_buffer_host = *byte_size_offset_buffer_host_.back();
input_ptr_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);
byte_size_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);
byte_size_offset_buffer_host.reserve(
pending_copy_kernel_input_buffer_counts_);
size_t byte_size_offset = 0;
for (const auto& response_input : pending_copy_kernel_input_buffers_) {
const auto& input = response_input.memory_desc_;
input_ptr_buffer_host.emplace_back(
const_cast<int8_t*>(reinterpret_cast<const int8_t*>(input.buffer_)));
byte_size_buffer_host.emplace_back(input.byte_size_);
byte_size_offset_buffer_host.emplace_back(byte_size_offset);
byte_size_offset += input.byte_size_;
}
BackendMemory* backend_memory = nullptr;
std::vector<BackendMemory::AllocationType> alloc_types;
switch (tensor_memory_type) {
case TRITONSERVER_MEMORY_GPU:
alloc_types = {BackendMemory::AllocationType::GPU_POOL,
BackendMemory::AllocationType::GPU};
break;
case TRITONSERVER_MEMORY_CPU_PINNED:
alloc_types = {BackendMemory::AllocationType::CPU_PINNED_POOL,
BackendMemory::AllocationType::CPU_PINNED};
break;
case TRITONSERVER_MEMORY_CPU:
alloc_types = {BackendMemory::AllocationType::CPU};
break;
}
// input_ptr_buffer
size_t input_ptr_buffer_byte_size =
pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*);
auto err = BackendMemory::Create(
memory_manager_, alloc_types, tensor_memory_type_id,
input_ptr_buffer_byte_size, &backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(tensor_memory_type) +
" id: " + std::to_string(tensor_memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
(backend_memory->MemoryType() != tensor_memory_type) ||
(backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Failed to obtain memory buffer for copy kernel input");
}
char* input_ptr_buffer = backend_memory->MemoryPtr();
// byte_size_buffer
size_t byte_size_buffer_byte_size =
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);
err = BackendMemory::Create(
memory_manager_, alloc_types, tensor_memory_type_id,
byte_size_buffer_byte_size, &backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(tensor_memory_type) +
" id: " + std::to_string(tensor_memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
(backend_memory->MemoryType() != tensor_memory_type) ||
(backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Failed to obtain memory buffer for copy kernel input");
}
char* byte_size_buffer = backend_memory->MemoryPtr();
// byte_size_offset_buffer
size_t byte_size_offset_buffer_byte_size =
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);
err = BackendMemory::Create(
memory_manager_, alloc_types, tensor_memory_type_id,
byte_size_offset_buffer_byte_size, &backend_memory);
if (err != nullptr) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("unable to create backend memory for type: ") +
TRITONSERVER_MemoryTypeString(tensor_memory_type) +
" id: " + std::to_string(tensor_memory_type_id) + ": " +
TRITONSERVER_ErrorMessage(err))
.c_str());
TRITONSERVER_ErrorDelete(err);
} else {
in_use_memories_.emplace_back(backend_memory);
}
if (backend_memory == nullptr ||
(backend_memory->MemoryType() != tensor_memory_type) ||
(backend_memory->MemoryTypeId() != tensor_memory_type_id)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Failed to obtain memory buffer for copy kernel input");
}
char* byte_size_offset_buffer = backend_memory->MemoryPtr();
cudaMemcpyAsync(
input_ptr_buffer, input_ptr_buffer_host.data(),
pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*),
cudaMemcpyDefault, stream_);
cudaMemcpyAsync(
byte_size_buffer, byte_size_buffer_host.data(),
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),
cudaMemcpyDefault, stream_);
cudaMemcpyAsync(
byte_size_offset_buffer, byte_size_offset_buffer_host.data(),
pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),
cudaMemcpyDefault, stream_);
if (buffer_ready_event_ != nullptr) {
cudaEventSynchronize(buffer_ready_event_);
buffer_ready_event_ = nullptr;
}
RETURN_IF_CUDA_ERROR(
RunGatherKernel(
(const int8_t**)input_ptr_buffer, (const size_t*)byte_size_buffer,
(const size_t*)byte_size_offset_buffer,
(int8_t*)tensor_buffer + pending_copy_kernel_buffer_offset_,
pending_copy_kernel_input_buffer_counts_, stream_),
TRITONSERVER_ERROR_INTERNAL,
std::string("Failed to launch gather kernel"));
return nullptr;
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Copy kernel can not be launched with TRITON_ENABLE_GPU=OFF");
#endif // TRITON_ENABLE_GPU
}
}} // namespace triton::backend
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_memory.h"
#include <map>
#include "triton/backend/backend_common.h"
namespace triton { namespace backend {
TRITONSERVER_Error*
BackendMemory::Create(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)
{
*mem = nullptr;
void* ptr = nullptr;
switch (alloc_type) {
case AllocationType::CPU_PINNED: {
#ifdef TRITON_ENABLE_GPU
RETURN_IF_CUDA_ERROR(
cudaHostAlloc(&ptr, byte_size, cudaHostAllocPortable),
TRITONSERVER_ERROR_UNAVAILABLE,
std::string("failed to allocate pinned system memory"));
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"pinned-memory allocation not supported");
#endif // TRITON_ENABLE_GPU
break;
}
case AllocationType::GPU: {
#ifdef TRITON_ENABLE_GPU
int current_device;
RETURN_IF_CUDA_ERROR(
cudaGetDevice(&current_device), TRITONSERVER_ERROR_INTERNAL,
std::string("failed to get device"));
bool overridden = (current_device != memory_type_id);
if (overridden) {
RETURN_IF_CUDA_ERROR(
cudaSetDevice(memory_type_id), TRITONSERVER_ERROR_INTERNAL,
std::string("failed to set device"));
}
auto err = cudaMalloc(&ptr, byte_size);
if (overridden) {
LOG_IF_CUDA_ERROR(
cudaSetDevice(current_device), "failed to set CUDA device");
}
RETURN_ERROR_IF_FALSE(
err == cudaSuccess, TRITONSERVER_ERROR_UNAVAILABLE,
std::string("failed to allocate GPU memory: ") +
cudaGetErrorString(err));
#else
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED, "GPU allocation not supported");
#endif // TRITON_ENABLE_GPU
break;
}
case AllocationType::CPU:
case AllocationType::CPU_PINNED_POOL:
case AllocationType::GPU_POOL:
RETURN_IF_ERROR(TRITONBACKEND_MemoryManagerAllocate(
manager, &ptr, AllocTypeToMemoryType(alloc_type), memory_type_id,
byte_size));
break;
}
*mem = new BackendMemory(
manager, alloc_type, memory_type_id, reinterpret_cast<char*>(ptr),
byte_size);
return nullptr; // success
}
TRITONSERVER_Error*
BackendMemory::Create(
TRITONBACKEND_MemoryManager* manager,
const std::vector<AllocationType>& alloc_types,
const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)
{
*mem = nullptr;
RETURN_ERROR_IF_TRUE(
alloc_types.size() == 0, TRITONSERVER_ERROR_INVALID_ARG,
std::string("BackendMemory::Create, at least one allocation type must be "
"specified"));
bool success = false;
std::unordered_map<AllocationType, TRITONSERVER_Error*> errors;
for (const AllocationType alloc_type : alloc_types) {
TRITONSERVER_Error* err =
Create(manager, alloc_type, memory_type_id, byte_size, mem);
if (err == nullptr) {
success = true;
break;
}
errors.insert({alloc_type, err});
}
// If allocation failed for all allocation types then display all
// the error messages and show the entire allocation request as
// failing.
if (!success) {
std::string msg = "BackendMemory::Create, all allocation types failed:";
for (const auto& pr : errors) {
const AllocationType alloc_type = pr.first;
TRITONSERVER_Error* err = pr.second;
msg += std::string("\n\t") + AllocTypeString(alloc_type) + ": " +
TRITONSERVER_ErrorMessage(err);
TRITONSERVER_ErrorDelete(err);
}
return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNAVAILABLE, msg.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error*
BackendMemory::Create(
TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
const int64_t memory_type_id, void* buffer, const size_t byte_size,
BackendMemory** mem)
{
*mem = new BackendMemory(
manager, alloc_type, memory_type_id, reinterpret_cast<char*>(buffer),
byte_size, false /* owns_buffer */);
return nullptr; // success
}
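// Illustrative sketch only (not part of the original source): allocating a
// pinned staging buffer with a plain-CPU fallback using the multi-type
// Create() overload above. The manager pointer and byte size are assumptions.
inline TRITONSERVER_Error*
ExampleAllocateStagingBuffer(
    TRITONBACKEND_MemoryManager* manager, const size_t byte_size,
    BackendMemory** mem)
{
  // Try the pooled pinned allocator first, then direct pinned allocation,
  // and finally fall back to pageable CPU memory.
  return BackendMemory::Create(
      manager,
      {BackendMemory::AllocationType::CPU_PINNED_POOL,
       BackendMemory::AllocationType::CPU_PINNED,
       BackendMemory::AllocationType::CPU},
      0 /* memory_type_id */, byte_size, mem);
}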
BackendMemory::~BackendMemory()
{
if (owns_buffer_) {
switch (alloctype_) {
case AllocationType::CPU_PINNED:
#ifdef TRITON_ENABLE_GPU
if (buffer_ != nullptr) {
LOG_IF_CUDA_ERROR(
cudaFreeHost(buffer_), "failed to free pinned memory");
}
#endif // TRITON_ENABLE_GPU
break;
case AllocationType::GPU:
#ifdef TRITON_ENABLE_GPU
if (buffer_ != nullptr) {
LOG_IF_CUDA_ERROR(cudaFree(buffer_), "failed to free CUDA memory");
}
#endif // TRITON_ENABLE_GPU
break;
case AllocationType::CPU:
case AllocationType::CPU_PINNED_POOL:
case AllocationType::GPU_POOL:
LOG_IF_ERROR(
TRITONBACKEND_MemoryManagerFree(
manager_, buffer_, AllocTypeToMemoryType(alloctype_),
memtype_id_),
"failed to free memory buffer");
break;
}
}
}
TRITONSERVER_MemoryType
BackendMemory::AllocTypeToMemoryType(const AllocationType a)
{
switch (a) {
case AllocationType::CPU:
return TRITONSERVER_MEMORY_CPU;
case AllocationType::CPU_PINNED:
case AllocationType::CPU_PINNED_POOL:
return TRITONSERVER_MEMORY_CPU_PINNED;
case AllocationType::GPU:
case AllocationType::GPU_POOL:
return TRITONSERVER_MEMORY_GPU;
}
return TRITONSERVER_MEMORY_CPU; // unreachable
}
const char*
BackendMemory::AllocTypeString(const AllocationType a)
{
switch (a) {
case AllocationType::CPU:
return "CPU";
case AllocationType::CPU_PINNED:
return "CPU_PINNED";
case AllocationType::GPU:
return "GPU";
case AllocationType::CPU_PINNED_POOL:
return "CPU_PINNED_POOL";
case AllocationType::GPU_POOL:
return "GPU_POOL";
}
return "<unknown>";
}
}} // namespace triton::backend
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_common.h"
namespace triton { namespace backend {
//
// BackendModel
//
BackendModel::BackendModel(
TRITONBACKEND_Model* triton_model, const bool allow_optional)
: triton_model_(triton_model), allow_optional_(allow_optional)
{
const char* model_name;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelName(triton_model, &model_name));
name_ = model_name;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelVersion(triton_model, &version_));
const char* repository_path = nullptr;
TRITONBACKEND_ArtifactType repository_artifact_type;
THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_ModelRepository(
triton_model, &repository_artifact_type, &repository_path));
if (repository_artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) {
throw BackendModelException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
(std::string("unsupported repository artifact type for model '") +
model_name + "'")
.c_str()));
}
repository_path_ = repository_path;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelServer(triton_model, &triton_server_));
TRITONBACKEND_Backend* backend;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelBackend(triton_model, &backend));
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_BackendMemoryManager(backend, &triton_memory_manager_));
THROW_IF_BACKEND_MODEL_ERROR(ParseModelConfig());
}
TRITONSERVER_Error*
BackendModel::ParseModelConfig()
{
TRITONSERVER_Message* config_message;
RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(
triton_model_, 1 /* config_version */, &config_message));
// Get the model configuration as a json string from
// config_message. We use TritonJson, which is a wrapper that
// returns nice errors (currently the underlying implementation is
// rapidjson... but others could be added).
const char* buffer;
size_t byte_size;
RETURN_IF_ERROR(
TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size));
TRITONSERVER_Error* err = model_config_.Parse(buffer, byte_size);
RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message));
RETURN_IF_ERROR(err);
int64_t mbs = 0;
RETURN_IF_ERROR(model_config_.MemberAsInt("max_batch_size", &mbs));
max_batch_size_ = mbs;
enable_pinned_input_ = false;
enable_pinned_output_ = false;
{
common::TritonJson::Value optimization;
if (model_config_.Find("optimization", &optimization)) {
common::TritonJson::Value pinned_memory;
if (optimization.Find("input_pinned_memory", &pinned_memory)) {
RETURN_IF_ERROR(
pinned_memory.MemberAsBool("enable", &enable_pinned_input_));
}
if (optimization.Find("output_pinned_memory", &pinned_memory)) {
RETURN_IF_ERROR(
pinned_memory.MemberAsBool("enable", &enable_pinned_output_));
}
}
}
RETURN_IF_ERROR(
BatchInput::ParseFromModelConfig(model_config_, &batch_inputs_));
RETURN_IF_ERROR(
BatchOutput::ParseFromModelConfig(model_config_, &batch_outputs_));
for (const auto& batch_output : batch_outputs_) {
for (const auto& name : batch_output.TargetNames()) {
batch_output_map_.emplace(name, &batch_output);
}
}
triton::common::TritonJson::Value config_inputs;
RETURN_IF_ERROR(model_config_.MemberAsArray("input", &config_inputs));
for (size_t i = 0; i < config_inputs.ArraySize(); i++) {
triton::common::TritonJson::Value io;
RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io));
std::string io_name;
RETURN_IF_ERROR(io.MemberAsString("name", &io_name));
triton::common::TritonJson::Value input_property_json;
bool allow_ragged_batch = false;
if (io.Find("allow_ragged_batch", &input_property_json)) {
RETURN_IF_ERROR(input_property_json.AsBool(&allow_ragged_batch));
}
if (allow_ragged_batch) {
ragged_inputs_.emplace(io_name);
}
bool optional = false;
if (io.Find("optional", &input_property_json)) {
RETURN_IF_ERROR(input_property_json.AsBool(&optional));
}
if (optional) {
if (allow_optional_) {
optional_inputs_.emplace(io_name);
} else {
RETURN_IF_ERROR(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
(std::string("'optional' is set to true for input '") + io_name +
"' while the backend model doesn't support optional input")
.c_str()));
}
}
}
return nullptr;
}
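// Illustrative example (not part of the original source) of the model
// configuration fields consumed by ParseModelConfig() above, shown in the
// model config text format; field names follow the parsing code, the
// values are assumptions:
//
//   max_batch_size: 8
//   optimization {
//     input_pinned_memory { enable: true }
//     output_pinned_memory { enable: true }
//   }
//   input [
//     { name: "RAGGED_INPUT" allow_ragged_batch: true ... },
//     { name: "OPTIONAL_INPUT" optional: true ... }
//   ]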
TRITONSERVER_Error*
BackendModel::SetModelConfig()
{
triton::common::TritonJson::WriteBuffer json_buffer;
RETURN_IF_ERROR(ModelConfig().Write(&json_buffer));
TRITONSERVER_Message* message;
RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(
&message, json_buffer.Base(), json_buffer.Size()));
RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(
triton_model_, 1 /* config_version */, message));
RETURN_IF_ERROR(TRITONSERVER_MessageDelete(message));
// Triton core can normalize the missing config settings
// in the above call. We must retrieve the updated model
// configuration from the core.
RETURN_IF_ERROR(ParseModelConfig());
return nullptr;
}
TRITONSERVER_Error*
BackendModel::SupportsFirstDimBatching(bool* supports)
{
*supports = max_batch_size_ > 0;
return nullptr;
}
const BatchOutput*
BackendModel::FindBatchOutput(const std::string& output_name) const
{
const auto it = batch_output_map_.find(output_name);
return ((it == batch_output_map_.end()) ? nullptr : it->second);
}
}} // namespace triton::backend
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_model_instance.h"
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_model.h"
namespace triton { namespace backend {
//
// BackendModelInstance
//
BackendModelInstance::BackendModelInstance(
BackendModel* backend_model,
TRITONBACKEND_ModelInstance* triton_model_instance)
: backend_model_(backend_model),
triton_model_instance_(triton_model_instance)
{
const char* instance_name;
THROW_IF_BACKEND_INSTANCE_ERROR(
TRITONBACKEND_ModelInstanceName(triton_model_instance, &instance_name));
name_ = instance_name;
THROW_IF_BACKEND_INSTANCE_ERROR(
TRITONBACKEND_ModelInstanceKind(triton_model_instance, &kind_));
THROW_IF_BACKEND_INSTANCE_ERROR(
TRITONBACKEND_ModelInstanceDeviceId(triton_model_instance, &device_id_));
common::TritonJson::Value& model_config = backend_model->ModelConfig();
// If the model configuration specifies a 'default_model_filename'
// and/or specifies 'cc_model_filenames' then determine the
// appropriate 'artifact_filename' value. If model configuration
// does not specify then just leave 'artifact_filename' empty and
// the backend can then provide its own logic for determining the
// filename if that is appropriate.
THROW_IF_BACKEND_INSTANCE_ERROR(model_config.MemberAsString(
"default_model_filename", &artifact_filename_));
switch (kind_) {
case TRITONSERVER_INSTANCEGROUPKIND_CPU: {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Creating instance ") + name_ +
" on CPU using artifact '" + artifact_filename_ + "'")
.c_str());
break;
}
case TRITONSERVER_INSTANCEGROUPKIND_MODEL: {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Creating instance ") + name_ +
" on model-specified devices using artifact '" + artifact_filename_ +
"'")
.c_str());
break;
}
case TRITONSERVER_INSTANCEGROUPKIND_GPU: {
#if defined(TRITON_ENABLE_GPU)
cudaDeviceProp cuprops;
cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, device_id_);
if (cuerr != cudaSuccess) {
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to get CUDA device properties for ") + name_ +
": " + cudaGetErrorString(cuerr))
.c_str()));
}
const std::string cc =
std::to_string(cuprops.major) + "." + std::to_string(cuprops.minor);
common::TritonJson::Value cc_names;
common::TritonJson::Value cc_name;
if ((model_config.Find("cc_model_filenames", &cc_names)) &&
(cc_names.Find(cc.c_str(), &cc_name))) {
cc_name.AsString(&artifact_filename_);
}
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Creating instance ") + name_ + " on GPU " +
std::to_string(device_id_) + " (" + cc + ") using artifact '" +
artifact_filename_ + "'")
.c_str());
#elif !defined(TRITON_ENABLE_MALI_GPU)
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, "GPU instances not supported"));
#endif // TRITON_ENABLE_GPU
break;
}
default: {
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unexpected instance kind for ") + name_).c_str()));
}
}
stream_ = nullptr;
if (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU) {
THROW_IF_BACKEND_INSTANCE_ERROR(
CreateCudaStream(device_id_, 0 /* cuda_stream_priority */, &stream_));
}
// Get the host policy setting as a json string from message,
// and extract the host policy name for the instance.
TRITONSERVER_Message* message = nullptr;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONBACKEND_ModelInstanceHostPolicy(triton_model_instance_, &message));
const char* buffer;
size_t byte_size;
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_MessageSerializeToJson(message, &buffer, &byte_size));
common::TritonJson::Value host_policy;
TRITONSERVER_Error* err = host_policy.Parse(buffer, byte_size);
THROW_IF_BACKEND_MODEL_ERROR(err);
std::vector<std::string> host_policy_name;
THROW_IF_BACKEND_MODEL_ERROR(host_policy.Members(&host_policy_name));
if (host_policy_name.size() != 1) {
throw BackendModelInstanceException(TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unexpected no host policy for ") + name_).c_str()));
}
host_policy_name_ = host_policy_name[0];
}
BackendModelInstance::~BackendModelInstance()
{
#ifdef TRITON_ENABLE_GPU
if (stream_ != nullptr) {
cudaError_t err = cudaStreamDestroy(stream_);
if (err != cudaSuccess) {
TRITONSERVER_LogMessage(
TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,
(std::string("~BackendModelInstance: ") + name_ +
" failed to destroy cuda stream: " + cudaGetErrorString(err))
.c_str());
}
stream_ = nullptr;
}
#endif // TRITON_ENABLE_GPU
}
}} // namespace triton::backend
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_output_responder.h"
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
namespace triton { namespace backend {
//
// BackendOutputResponder
//
BackendOutputResponder::~BackendOutputResponder()
{
for (auto& pinned_memory : pinned_memories_) {
LOG_IF_ERROR(
TRITONBACKEND_MemoryManagerFree(
memory_manager_, reinterpret_cast<void*>(pinned_memory),
TRITONSERVER_MEMORY_CPU_PINNED, 0),
"failed to free pinned memory");
}
}
void
BackendOutputResponder::ProcessTensor(
const std::string& output_name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
const int64_t batchn_batch_size = batchn_shape[0];
int64_t batch_size_offset = 0;
size_t tensor_offset = 0;
for (size_t idx = 0; idx < responses_->size(); idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
// If the pending copies are from a tensor buffer that is not
// contiguous with this response's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(tensor_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
}
// Override shape to be correct for this response.
if (first_dim_batching_) {
TRITONBACKEND_Input* input;
TRITONBACKEND_RequestInputByIndex(request, 0, &input);
const int64_t* shape;
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
if ((batchn_batch_size != -1) &&
((batch_size_offset + shape[0]) > batchn_batch_size)) {
if (response != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
std::string(
GetRequestId(request) +
"failed to split the output tensor '" + output_name +
"' in responses: expected batch size of atleast " +
std::to_string(batch_size_offset + shape[0]) +
" in model output, got " +
std::to_string(batchn_batch_size))
.c_str()));
}
}
batchn_shape[0] = shape[0];
batch_size_offset += shape[0];
}
const size_t tensor_byte_size = GetByteSize(datatype, batchn_shape);
TRITONBACKEND_Output* response_output;
if (response != nullptr) {
uint32_t output_count;
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_RequestOutputCount(request, &output_count));
if (response != nullptr) {
for (uint32_t output_idx = 0; output_idx < output_count; output_idx++) {
const char* name;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestOutputName(request, output_idx, &name));
if ((response != nullptr) && (output_name == name)) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_ResponseOutput(
response, &response_output, name, datatype,
batchn_shape.data(), batchn_shape.size()));
if (response != nullptr) {
need_sync_ |= SetFixedSizeBuffer(
&response, response_output, output_name, tensor_byte_size,
tensor_offset, buffer, memory_type, memory_type_id,
use_pinned_memory_type, false /* state */);
}
break;
}
}
}
}
tensor_offset += tensor_byte_size;
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
}
std::vector<TRITONBACKEND_State*>
BackendOutputResponder::ProcessStateTensor(
const std::string& output_state_name, const TRITONSERVER_DataType datatype,
std::vector<int64_t>& batchn_shape, const char* buffer,
const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
std::vector<TRITONBACKEND_State*> states;
const int64_t batchn_batch_size = batchn_shape[0];
int64_t batch_size_offset = 0;
size_t tensor_offset = 0;
for (size_t idx = 0; idx < responses_->size(); idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
// If the pending copies are from a tensor buffer that is not
// contiguous with this response's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(tensor_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
}
// Override shape to be correct for this response.
if (first_dim_batching_) {
TRITONBACKEND_Input* input;
TRITONBACKEND_RequestInputByIndex(request, 0, &input);
const int64_t* shape;
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
if ((batchn_batch_size != -1) &&
((batch_size_offset + shape[0]) > batchn_batch_size)) {
if (response != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
std::string(
GetRequestId(request) +
"failed to split the output state tensor '" +
output_state_name +
"' in responses: expected batch size of atleast " +
std::to_string(batch_size_offset + shape[0]) +
" in model output, got " +
std::to_string(batchn_batch_size))
.c_str()));
}
}
batchn_shape[0] = shape[0];
batch_size_offset += shape[0];
}
const size_t tensor_byte_size = GetByteSize(datatype, batchn_shape);
TRITONBACKEND_State* output_state;
if (response != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_StateNew(
&output_state, request, output_state_name.c_str(),
datatype, batchn_shape.data(), batchn_shape.size()));
if (response != nullptr) {
states.push_back(output_state);
need_sync_ |= SetFixedSizeBuffer(
&response, output_state, output_state_name, tensor_byte_size,
tensor_offset, buffer, memory_type, memory_type_id,
use_pinned_memory_type, true /* state */);
}
}
tensor_offset += tensor_byte_size;
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
return states;
}
bool
BackendOutputResponder::Finalize()
{
#ifdef TRITON_ENABLE_GPU
if ((!deferred_pinned_.empty()) && need_sync_) {
if (event_ != nullptr) {
cudaEventSynchronize(event_);
} else {
cudaStreamSynchronize(stream_);
}
need_sync_ = false;
}
#endif // TRITON_ENABLE_GPU
// After the above sync all the GPU->pinned copies are complete. Any
// deferred copies of pinned->CPU can now be done.
for (auto& def : deferred_pinned_) {
auto pinned_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;
int64_t pinned_memory_id = 0;
char* pinned_buffer = def.pinned_memory_;
size_t offset = 0;
for (auto& pr : def.responses_) {
auto& response = pr.first;
auto& response_output = pr.second;
bool cuda_used = false;
RESPOND_AND_SET_NULL_IF_ERROR(
response,
CopyBuffer(
response_output.name_, pinned_memory_type, pinned_memory_id,
response_output.memory_type_, response_output.memory_type_id_,
response_output.buffer_byte_size_, pinned_buffer + offset,
const_cast<void*>(response_output.buffer_), stream_, &cuda_used,
copy_on_stream_));
need_sync_ |= cuda_used;
offset += response_output.buffer_byte_size_;
}
}
#ifdef TRITON_ENABLE_GPU
// Record the new event location if deferred copies occur
if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
deferred_pinned_.clear();
return need_sync_;
}
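// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, not part of the library): a backend that has
// produced one contiguous output buffer for the whole batch can hand the
// data back to the individual responses with ProcessTensor() and then
// Finalize(). 'responder', 'output_buffer', 'total_batch_size' and 'stream'
// are illustrative names; the responder is assumed to have been constructed
// with this execution's requests, responses, memory manager and CUDA stream.
//
//   std::vector<int64_t> batchn_shape = {total_batch_size, 16 /* example */};
//   responder.ProcessTensor(
//       "OUTPUT0", TRITONSERVER_TYPE_FP32, batchn_shape, output_buffer,
//       TRITONSERVER_MEMORY_GPU, 0 /* memory_type_id */);
//
//   // Finalize() performs any deferred pinned->CPU copies and reports
//   // whether asynchronous CUDA copies are still outstanding on the stream.
//   const bool need_cuda_sync = responder.Finalize();
//   #ifdef TRITON_ENABLE_GPU
//   if (need_cuda_sync) {
//     cudaStreamSynchronize(stream);
//   }
//   #endif  // TRITON_ENABLE_GPU
// ---------------------------------------------------------------------------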
bool
BackendOutputResponder::SetFixedSizeBuffer(
TRITONBACKEND_Response** response, void* response_output_or_state,
const std::string& output_name, const size_t tensor_byte_size,
const size_t tensor_offset, const char* tensor_buffer,
const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id,
const TRITONSERVER_MemoryType use_pinned_memory_type, bool state)
{
void* buffer = nullptr;
bool cuda_copy = false;
TRITONSERVER_MemoryType actual_memory_type = tensor_memory_type;
int64_t actual_memory_type_id = tensor_memory_type_id;
if (state) {
TRITONBACKEND_State* response_state =
reinterpret_cast<TRITONBACKEND_State*>(response_output_or_state);
auto err = TRITONBACKEND_StateBuffer(
response_state, &buffer, tensor_byte_size, &actual_memory_type,
&actual_memory_type_id);
if (err != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(response, err);
return cuda_copy;
}
} else {
TRITONBACKEND_Output* response_output =
reinterpret_cast<TRITONBACKEND_Output*>(response_output_or_state);
auto err = TRITONBACKEND_OutputBuffer(
response_output, &buffer, tensor_byte_size, &actual_memory_type,
&actual_memory_type_id);
if (err != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(response, err);
return cuda_copy;
}
}
// If the response buffer matches the memory type that should use an
// intermediate pinned memory buffer for the transfer, then just
// record the response as pending and increase the size required for
// the intermediate pinned buffer.
if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&
(actual_memory_type == use_pinned_memory_type)) {
if (pending_pinned_byte_size_ == 0) {
pending_pinned_offset_ = tensor_offset;
}
pending_pinned_byte_size_ += tensor_byte_size;
pending_pinned_outputs_.push_back(std::make_pair(
response, OutputData(
output_name, buffer, tensor_byte_size, actual_memory_type,
actual_memory_type_id)));
} else {
// Direct copy without intermediate pinned memory.
bool cuda_used = false;
auto err = CopyBuffer(
output_name, tensor_memory_type, tensor_memory_type_id,
actual_memory_type, actual_memory_type_id, tensor_byte_size,
tensor_buffer + tensor_offset, buffer, stream_, &cuda_used,
copy_on_stream_);
cuda_copy |= cuda_used;
if (err != nullptr) {
RESPOND_AND_SET_NULL_IF_ERROR(response, err);
return cuda_copy;
}
}
return cuda_copy;
}
bool
BackendOutputResponder::FlushPendingPinned(
const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type,
const int64_t tensor_memory_type_id)
{
bool cuda_copy = false;
// Will be copying from CPU->pinned->GPU or GPU->pinned->CPU
// Attempt to allocate a pinned buffer to use for staging the
// copy... if we fail to allocate the pinned buffer then we just
// directly go CPU->GPU or GPU->CPU.
char* pinned_memory = nullptr;
if (pending_pinned_byte_size_ > 0) {
TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate(
memory_manager_, reinterpret_cast<void**>(&pinned_memory),
TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */,
pending_pinned_byte_size_);
if (err != nullptr) {
pinned_memory = nullptr;
TRITONSERVER_ErrorDelete(err);
}
}
// If the pinned buffer wasn't actually allocated then just perform
// a direct copy.
if (pinned_memory == nullptr) {
size_t offset = 0;
for (auto& pr : pending_pinned_outputs_) {
auto& response = pr.first;
auto& response_output = pr.second;
bool cuda_used = false;
RESPOND_AND_SET_NULL_IF_ERROR(
response,
CopyBuffer(
response_output.name_, tensor_memory_type, tensor_memory_type_id,
response_output.memory_type_, response_output.memory_type_id_,
response_output.buffer_byte_size_,
tensor_buffer + pending_pinned_offset_ + offset,
const_cast<void*>(response_output.buffer_), stream_, &cuda_used,
copy_on_stream_));
cuda_copy |= cuda_used;
offset += response_output.buffer_byte_size_;
}
}
// We have a pinned buffer so do a single copy of a block of tensor
// data to the pinned buffer.
else {  // pinned_memory was successfully allocated
bool cuda_used = false;
auto err = CopyBuffer(
"pinned buffer", tensor_memory_type, tensor_memory_type_id,
TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */,
pending_pinned_byte_size_, tensor_buffer + pending_pinned_offset_,
pinned_memory, stream_, &cuda_used, copy_on_stream_);
cuda_copy |= cuda_used;
// If something goes wrong with the copy all the pending
// responses fail...
if (err != nullptr) {
for (auto& pr : pending_pinned_outputs_) {
auto& response = pr.first;
if (*response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
*response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
"failed to send TensorFlow error response");
*response = nullptr;
}
}
TRITONSERVER_ErrorDelete(err);
}
// If the copy was not async (i.e. if tensor was in CPU so a
// CPU->CPU-PINNED copy was performed above), then the pinned
// buffer now holds the tensor contents and we can immediately
// issue the copies from the pinned buffer to the
// responses.
//
// Otherwise the GPU->CPU-PINNED async copies are in flight and we
// simply remember the pinned buffer and the corresponding
// response outputs so that we can do the pinned->CPU copies in
// finalize after we have waited for all async copies to complete.
if (!cuda_used) {
size_t offset = 0;
for (auto& pr : pending_pinned_outputs_) {
auto& response = pr.first;
auto& response_output = pr.second;
bool cuda_used = false;
RESPOND_AND_SET_NULL_IF_ERROR(
response,
CopyBuffer(
response_output.name_, TRITONSERVER_MEMORY_CPU_PINNED,
0 /* memory_type_id */, response_output.memory_type_,
response_output.memory_type_id_,
response_output.buffer_byte_size_, pinned_memory + offset,
const_cast<void*>(response_output.buffer_), stream_, &cuda_used,
copy_on_stream_));
cuda_copy |= cuda_used;
offset += response_output.buffer_byte_size_;
}
} else {
deferred_pinned_.emplace_back(
pinned_memory, pending_pinned_byte_size_,
std::move(pending_pinned_outputs_));
}
}
// Pending pinned copies are handled...
pending_pinned_byte_size_ = 0;
pending_pinned_offset_ = 0;
pending_pinned_outputs_.clear();
// Need to hold on to the allocated pinned buffer as there are still
// copies in flight. Will delete it in finalize.
if (pinned_memory != nullptr) {
pinned_memories_.push_back(pinned_memory);
}
return cuda_copy;
}
void
BackendOutputResponder::ProcessBatchOutput(
const std::string& name, const BatchOutput& batch_output,
const char* buffer, const TRITONSERVER_MemoryType memory_type,
const int64_t memory_type_id)
{
// A value of CPU_PINNED indicates that pinned memory buffer is not
// needed for this tensor. Any other value indicates that a pinned
// memory buffer is needed when the target memory type matches
// 'use_pinned_memory_type'.
TRITONSERVER_MemoryType use_pinned_memory_type =
TRITONSERVER_MEMORY_CPU_PINNED;
if (pinned_enabled_) {
use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
}
// Batch output may be processed differently based on the kind
switch (batch_output.BatchOutputKind()) {
case BatchOutput::Kind::BATCH_SCATTER_WITH_INPUT_SHAPE: {
const auto& output_name = batch_output.TargetNames()[0];
const auto& input_name = batch_output.SourceInputs()[0];
const auto& datatype = batch_output.DataType();
size_t tensor_offset = 0;
for (size_t idx = 0; idx < responses_->size(); idx++) {
auto& request = requests_[idx];
auto& response = (*responses_)[idx];
// If the pending copies are from a tensor buffer that is not
// contiguous with this response's part of that buffer, perform the
// pending copies now so that a new contiguous region can be started
// if necessary.
if ((pending_pinned_byte_size_ > 0) &&
(tensor_offset !=
(pending_pinned_byte_size_ + pending_pinned_offset_))) {
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
}
// Override shape to be correct for this response, under the naive
// assumption that a dynamic dimension in the output maps to the same
// dimension in the input.
auto output_batchn_shape = batch_output.OutputShape();
{
TRITONBACKEND_Input* input;
TRITONBACKEND_RequestInput(request, input_name.c_str(), &input);
const int64_t* shape;
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
for (size_t dim_idx = 0; dim_idx < output_batchn_shape.size();
dim_idx++) {
if (output_batchn_shape[dim_idx] == -1) {
output_batchn_shape[dim_idx] = shape[dim_idx];
}
}
}
const size_t tensor_byte_size =
GetByteSize(datatype, output_batchn_shape);
TRITONBACKEND_Output* response_output;
if (response != nullptr) {
uint32_t output_count;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestOutputCount(request, &output_count));
if (response != nullptr) {
for (uint32_t output_idx = 0; output_idx < output_count;
output_idx++) {
const char* name;
RESPOND_AND_SET_NULL_IF_ERROR(
&response,
TRITONBACKEND_RequestOutputName(request, output_idx, &name));
if ((response != nullptr) && (output_name == name)) {
RESPOND_AND_SET_NULL_IF_ERROR(
&response, TRITONBACKEND_ResponseOutput(
response, &response_output, name, datatype,
output_batchn_shape.data(),
output_batchn_shape.size()));
if (response != nullptr) {
need_sync_ |= SetFixedSizeBuffer(
&response, response_output, output_name, tensor_byte_size,
tensor_offset, buffer, memory_type, memory_type_id,
use_pinned_memory_type, false /* state */);
}
break;
}
}
}
}
tensor_offset += tensor_byte_size;
}
break;
}
}
// Done with the tensor, flush any pending pinned copies.
need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
if (need_sync_ && (event_ != nullptr)) {
cudaEventRecord(event_, stream_);
}
#endif // TRITON_ENABLE_GPU
}
}} // namespace triton::backend
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "kernel.h"
#include <cuda.h>
#define THREADBLOCK_SIZE 512
__launch_bounds__(THREADBLOCK_SIZE) __global__ void TritonGatherKernel(
const int8_t** __restrict input_ptr_buffer,
const size_t* __restrict byte_size_buffer,
const size_t* __restrict byte_size_offset_buffer,
int8_t* __restrict output_buffer)
{
int request_idx = blockIdx.x;
int lane_id = threadIdx.x;
const int8_t* request_input_buffer = input_ptr_buffer[request_idx];
int byte_size = byte_size_buffer[request_idx];
int byte_size_offset = byte_size_offset_buffer[request_idx];
int8_t* output_buffer_with_offset = output_buffer + byte_size_offset;
if (((byte_size % 4) == 0) && (((uint64_t)request_input_buffer % 4) == 0) &&
(((uint64_t)output_buffer_with_offset % 4) == 0)) {
int32_t* input_4 = (int32_t*)request_input_buffer;
int32_t* output_4 = (int32_t*)output_buffer_with_offset;
int element_count = byte_size / 4;
for (int elem_id = lane_id; elem_id < element_count;
elem_id += THREADBLOCK_SIZE) {
output_4[elem_id] = input_4[elem_id];
}
} else {
for (int elem_id = lane_id; elem_id < byte_size;
elem_id += THREADBLOCK_SIZE) {
output_buffer_with_offset[elem_id] =
__ldg(request_input_buffer + elem_id);
}
}
}
#ifdef __cplusplus
extern "C" {
#endif
cudaError_t
RunGatherKernel(
const int8_t** input_ptr_buffer, const size_t* byte_size_buffer,
const size_t* byte_size_offset_buffer, int8_t* output_buffer,
size_t request_count, cudaStream_t stream)
{
TritonGatherKernel<<<request_count, THREADBLOCK_SIZE, 0, stream>>>(
input_ptr_buffer, byte_size_buffer, byte_size_offset_buffer,
output_buffer);
return cudaGetLastError();
}
#ifdef __cplusplus
}
#endif
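// ---------------------------------------------------------------------------
// Host-side usage sketch (hypothetical, for illustration only). The kernel
// launches one thread block per request and copies each request's buffer to
// its offset in a single contiguous output buffer, using 4-byte loads when
// the size and both pointers are 4-byte aligned. Note that the pointer, size
// and offset arrays passed to RunGatherKernel must themselves be accessible
// from the device. 'd_input_for_request' and 'input_byte_size_for_request'
// are illustrative helpers, not part of this code.
//
//   std::vector<const int8_t*> h_ptrs(request_count);
//   std::vector<size_t> h_sizes(request_count), h_offsets(request_count);
//   size_t total_byte_size = 0;
//   for (size_t r = 0; r < request_count; ++r) {
//     h_ptrs[r] = d_input_for_request(r);
//     h_sizes[r] = input_byte_size_for_request(r);
//     h_offsets[r] = total_byte_size;
//     total_byte_size += h_sizes[r];
//   }
//
//   const int8_t** d_ptrs; size_t* d_sizes; size_t* d_offsets;
//   int8_t* d_output;
//   cudaMalloc(&d_ptrs, request_count * sizeof(int8_t*));
//   cudaMalloc(&d_sizes, request_count * sizeof(size_t));
//   cudaMalloc(&d_offsets, request_count * sizeof(size_t));
//   cudaMalloc(&d_output, total_byte_size);
//   cudaMemcpyAsync(
//       d_ptrs, h_ptrs.data(), request_count * sizeof(int8_t*),
//       cudaMemcpyHostToDevice, stream);
//   cudaMemcpyAsync(
//       d_sizes, h_sizes.data(), request_count * sizeof(size_t),
//       cudaMemcpyHostToDevice, stream);
//   cudaMemcpyAsync(
//       d_offsets, h_offsets.data(), request_count * sizeof(size_t),
//       cudaMemcpyHostToDevice, stream);
//
//   cudaError_t err = RunGatherKernel(
//       d_ptrs, d_sizes, d_offsets, d_output, request_count, stream);
//   // On failure a backend would typically fall back to per-request
//   // cudaMemcpyAsync copies.
// ---------------------------------------------------------------------------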
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <cuda_runtime_api.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
cudaError_t RunGatherKernel(
const int8_t** input_ptr_buffer, const size_t* byte_size_buffer,
const size_t* byte_size_offset_buffer, int8_t* output_buffer,
size_t request_count, cudaStream_t stream);
#ifdef __cplusplus
}
#endif
---
BasedOnStyle: Google
IndentWidth: 2
ContinuationIndentWidth: 4
UseTab: Never
MaxEmptyLinesToKeep: 2
SortIncludes: true
CompactNamespaces: true
ReflowComments: true
DerivePointerAlignment: false
PointerAlignment: Left
AllowShortIfStatementsOnASingleLine: false
AllowShortBlocksOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlignAfterOpenBracket: AlwaysBreak
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: true
AfterNamespace: false
AfterStruct: false
AfterUnion: false
BeforeCatch: true
BinPackArguments: true
BinPackParameters: true
ConstructorInitializerAllOnOneLineOrOnePerLine: false
IndentCaseLabels: true
# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.16)
project(tritoncommon LANGUAGES C CXX)
#
# Options
#
# Some components are expensive to build and have extensive
# dependencies, so those parts of the build must be enabled
# explicitly.
option(TRITON_COMMON_ENABLE_PROTOBUF "Build protobuf artifacts" OFF)
option(TRITON_COMMON_ENABLE_PROTOBUF_PYTHON "Build protobuf artifacts for python" ON)
option(TRITON_COMMON_ENABLE_GRPC "Build grpc artifacts" OFF)
option(TRITON_COMMON_ENABLE_JSON "Build json-related libs" ON)
#option(TRITON_COMMON_ENABLE_JSON "Build json-related libs" OFF)
if(TRITON_COMMON_ENABLE_JSON)
find_package(RapidJSON CONFIG REQUIRED)
message(STATUS "RapidJSON found. Headers: ${RAPIDJSON_INCLUDE_DIRS}")
endif()
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
message("Using MSVC as compiler, default target on Windows 10. "
"If the target system is not Windows 10, please update _WIN32_WINNT "
"to corresponding value.")
endif()
add_library(common-compile-settings INTERFACE)
target_compile_features(common-compile-settings INTERFACE cxx_std_11)
target_compile_options(common-compile-settings INTERFACE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>
$<$<CXX_COMPILER_ID:MSVC>:/W0 /D_WIN32_WINNT=0x0A00 /EHsc>
)
#
# Error
#
add_library(
triton-common-error
src/error.cc
)
add_library(
TritonCommon::triton-common-error ALIAS triton-common-error
)
target_include_directories(
triton-common-error
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-error PRIVATE common-compile-settings)
#
# Logging
#
add_library(
triton-common-logging
src/logging.cc
)
add_library(
TritonCommon::triton-common-logging ALIAS triton-common-logging
)
target_include_directories(
triton-common-logging
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
if(${TRITON_ENABLE_LOGGING})
target_compile_definitions(
triton-common-logging
PRIVATE TRITON_ENABLE_LOGGING=1
)
endif() # TRITON_ENABLE_LOGGING
target_link_libraries(triton-common-logging PRIVATE common-compile-settings)
#
# SyncQueue
#
add_library(
triton-common-sync-queue INTERFACE
)
add_library(
TritonCommon::triton-common-sync-queue ALIAS triton-common-sync-queue
)
target_include_directories(
triton-common-sync-queue
INTERFACE
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)
#
# Async Work Queue
#
add_library(
triton-common-async-work-queue
src/async_work_queue.cc
src/error.cc
src/thread_pool.cc
)
add_library(
TritonCommon::triton-common-async-work-queue ALIAS triton-common-async-work-queue
)
target_include_directories(
triton-common-async-work-queue
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-async-work-queue
PUBLIC
Threads::Threads
PRIVATE
common-compile-settings
)
#
# Thread Pool
#
add_library(
triton-common-thread-pool
src/thread_pool.cc
)
add_library(
TritonCommon::triton-common-thread-pool ALIAS triton-common-thread-pool
)
target_include_directories(
triton-common-thread-pool
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-thread-pool
PUBLIC
Threads::Threads
PRIVATE
common-compile-settings
)
#
# JSON utilities
#
if(TRITON_COMMON_ENABLE_JSON)
add_library(
triton-common-json INTERFACE
)
add_library(
TritonCommon::triton-common-json ALIAS triton-common-json
)
target_include_directories(
triton-common-json
INTERFACE
$<INSTALL_INTERFACE:include>
$<INSTALL_INTERFACE:${RAPIDJSON_INCLUDE_DIRS}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${RAPIDJSON_INCLUDE_DIRS}>
)
endif()
#
# Table Printer
#
add_library(
triton-common-table-printer
src/table_printer.cc
)
add_library(
TritonCommon::triton-common-table-printer ALIAS triton-common-table-printer
)
target_include_directories(
triton-common-table-printer
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(triton-common-table-printer PRIVATE common-compile-settings)
set_target_properties(
triton-common-async-work-queue
triton-common-error
triton-common-logging
triton-common-table-printer
triton-common-thread-pool
PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS TRUE
POSITION_INDEPENDENT_CODE ON
)
set_target_properties(
triton-common-async-work-queue
PROPERTIES
OUTPUT_NAME tritonasyncworkqueue
)
set_target_properties(
triton-common-thread-pool
PROPERTIES
OUTPUT_NAME tritonthreadpool
)
set_target_properties(
triton-common-error
PROPERTIES
OUTPUT_NAME tritoncommonerror
)
set_target_properties(
triton-common-logging
PROPERTIES
OUTPUT_NAME tritoncommonlogging
)
set_target_properties(
triton-common-table-printer
PROPERTIES
OUTPUT_NAME tritontableprinter
)
#
# Protobuf and GRPC artifacts
#
if(${TRITON_COMMON_ENABLE_PROTOBUF} OR ${TRITON_COMMON_ENABLE_GRPC})
add_subdirectory(protobuf)
set(protobuf_MODULE_COMPATIBLE TRUE CACHE BOOL "protobuf_MODULE_COMPATIBLE" FORCE)
find_package(Protobuf CONFIG REQUIRED)
message(STATUS "Using protobuf ${Protobuf_VERSION}")
#
# Model Config (depends on protobuf & generated .pb.h file)
#
add_library(
triton-common-model-config
src/model_config.cc
)
add_library(
TritonCommon::triton-common-model-config ALIAS triton-common-model-config
)
target_include_directories(
triton-common-model-config
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
${Protobuf_INCLUDE_DIRS}
)
target_link_libraries(
triton-common-model-config
PRIVATE
common-compile-settings
protobuf::libprotobuf
proto-library
)
set_target_properties(
triton-common-model-config
PROPERTIES
WINDOWS_EXPORT_ALL_SYMBOLS TRUE
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME tritoncommonmodelconfig
)
endif()
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonCommon)
install(
TARGETS
triton-common-async-work-queue
triton-common-error
triton-common-logging
triton-common-sync-queue
triton-common-table-printer
triton-common-thread-pool
common-compile-settings
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
if(TRITON_COMMON_ENABLE_JSON)
install(
TARGETS
triton-common-json
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(${TRITON_COMMON_ENABLE_GRPC} OR ${TRITON_COMMON_ENABLE_PROTOBUF})
install(
TARGETS
proto-library
triton-common-model-config
# proto-py-library
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(${TRITON_COMMON_ENABLE_GRPC})
install(
TARGETS
grpc-service-library
# grpc-service-py-library
EXPORT
triton-common-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
install(
DIRECTORY include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
install(
EXPORT
triton-common-targets
FILE
TritonCommonTargets.cmake
NAMESPACE
TritonCommon::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/TritonCommonConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonCommonConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonCommonConfig.cmake
DESTINATION
${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT
triton-common-targets
FILE
${CMAKE_CURRENT_BINARY_DIR}/TritonCommonTargets.cmake
NAMESPACE
TritonCommon::
)
export(PACKAGE TritonCommon)
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
<!--
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
# Triton Inference Server Common
Common source, scripts and utilities shared across all Triton
repositories.
This repo is not typically built directly but is instead included in
the build of other repos. To build directly, first install the required
dependencies.
```
$ apt-get install rapidjson-dev
```
Use cmake 3.17 or later to build and install in a local directory.
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..
$ make install
```
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@PACKAGE_INIT@
set_and_check(TRITONCOMMON_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}")
list(APPEND CMAKE_MODULE_PATH ${TRITONCOMMON_CMAKE_DIR})
include(CMakeFindDependencyMacro)
find_dependency(Threads)
if(NOT TARGET TritonCommon::triton-common-json)
include("${TRITONCOMMON_CMAKE_DIR}/TritonCommonTargets.cmake")
endif()
check_required_components(triton-common-json
triton-common-sync-queue
triton-common-async-work-queue
triton-common-thread-pool
)
set(TRITONCOMMON_LIBRARIES
TritonCommon::triton-common-json
TritonCommon::triton-common-sync-queue
TritonCommon::triton-common-async-work-queue
TritonCommon::triton-common-thread-pool
)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "error.h"
#include "thread_pool.h"
namespace triton { namespace common {
// Manager for asynchronous worker threads. Use to accelerate copies and
// other such operations by running them in parallel.
// Call Initialize to start the worker threads (once) and AddTask to add
// tasks to the queue.
class AsyncWorkQueue {
public:
// Start 'worker_count' number of worker threads.
static Error Initialize(size_t worker_count);
// Get the number of worker threads.
static size_t WorkerCount();
// Add a 'task' to the queue. The function will take ownership of 'task'.
// Therefore std::move should be used when calling AddTask.
static Error AddTask(std::function<void(void)>&& task);
protected:
static void Reset();
private:
AsyncWorkQueue() = default;
~AsyncWorkQueue();
static AsyncWorkQueue* GetSingleton();
std::unique_ptr<ThreadPool> thread_pool_;
};
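// Usage sketch (hypothetical): initialize the pool once at startup and then
// move tasks into the queue; both calls return an Error that should be
// checked.
//
//   auto err = AsyncWorkQueue::Initialize(4 /* worker_count */);
//   if (!err.IsOk()) { /* report err.Message() */ }
//
//   std::function<void(void)> task = []() { /* e.g. a buffer copy */ };
//   err = AsyncWorkQueue::AddTask(std::move(task));
//   if (!err.IsOk()) { /* report err.Message() */ }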
}} // namespace triton::common
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
namespace triton { namespace common {
//
// Error
//
// Error returned by utilities from common repo.
//
class Error {
public:
enum class Code {
SUCCESS,
UNKNOWN,
INTERNAL,
NOT_FOUND,
INVALID_ARG,
UNAVAILABLE,
UNSUPPORTED,
ALREADY_EXISTS
};
explicit Error(Code code = Code::SUCCESS) : code_(code) {}
explicit Error(Code code, const std::string& msg) : code_(code), msg_(msg) {}
// Convenience "success" value. Can be used as Error::Success to
// indicate no error.
static const Error Success;
// Return the code for this status.
Code ErrorCode() const { return code_; }
// Return the message for this status.
const std::string& Message() const { return msg_; }
// Return true if this status indicates "ok"/"success", false if
// status indicates some kind of failure.
bool IsOk() const { return code_ == Code::SUCCESS; }
// Return the status as a string.
std::string AsString() const;
// Return the constant string name for a code.
static const char* CodeString(const Code code);
protected:
Code code_;
std::string msg_;
};
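// Usage sketch (hypothetical): utilities in this repo return Error by value;
// callers check IsOk() and log or propagate AsString() on failure.
//
//   Error ExampleLoad(const std::string& path)   // illustrative function
//   {
//     if (path.empty()) {
//       return Error(Error::Code::INVALID_ARG, "path must not be empty");
//     }
//     return Error::Success;
//   }
//
//   const Error err = ExampleLoad("");
//   if (!err.IsOk()) {
//     std::cerr << err.AsString() << std::endl;
//   }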
}} // namespace triton::common
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <mutex>
#include <sstream>
#include <string>
#include <vector>
#include <cerrno>
#include <cstring>
#include <fstream>
namespace triton { namespace common {
// A log message.
class LogMessage {
public:
// Log levels.
enum Level { kERROR = 0, kWARNING = 1, kINFO = 2 };
LogMessage(const char* file, int line, uint32_t level);
~LogMessage();
std::stringstream& stream() { return stream_; }
private:
static const std::vector<char> level_name_;
std::stringstream stream_;
};
// Global logger for messages. Controls how log messages are reported.
class Logger {
public:
enum class Format { kDEFAULT, kISO8601 };
Logger();
// Is a log level enabled.
bool IsEnabled(LogMessage::Level level) const { return enables_[level]; }
// Set enable for a log Level.
void SetEnabled(LogMessage::Level level, bool enable)
{
enables_[level] = enable;
}
// Get the current verbose logging level.
uint32_t VerboseLevel() const { return vlevel_; }
// Set the current verbose logging level.
void SetVerboseLevel(uint32_t vlevel) { vlevel_ = vlevel; }
// Get the logging format.
Format LogFormat() { return format_; }
// Get the logging format as a string.
std::string LogFormatString()
{
switch (format_) {
case Format::kISO8601:
return "ISO8601";
case Format::kDEFAULT:
return "default";
default:
return "Invalid format";
}
}
// Set the logging format.
void SetLogFormat(Format format) { format_ = format; }
// Get the log output file name.
const std::string& LogFile() { return filename_; }
// Set the log output file. Returns an empty string upon
// success, else returns an error string.
const std::string SetLogFile(const std::string& filename)
{
const std::lock_guard<std::mutex> lock(mutex_);
file_stream_.close();
std::string revert_name(filename_);
filename_ = filename;
if (!filename_.empty()) {
file_stream_.open(filename_, std::ios::app);
if (file_stream_.fail()) {
std::stringstream error;
error << __FILE__ << " " << __LINE__
<< ": Failed to open log file: " << std::strerror(errno)
<< std::endl;
filename_ = revert_name;
file_stream_.open(filename_, std::ios::app);
return error.str();
}
}
// will return an empty string
return std::string();
}
// Log a message.
void Log(const std::string& msg);
// Flush the log.
void Flush();
private:
std::vector<bool> enables_;
uint32_t vlevel_;
Format format_;
std::mutex mutex_;
std::string filename_;
std::ofstream file_stream_;
};
extern Logger gLogger_;
#define LOG_ENABLE_INFO(E) \
triton::common::gLogger_.SetEnabled( \
triton::common::LogMessage::Level::kINFO, (E))
#define LOG_ENABLE_WARNING(E) \
triton::common::gLogger_.SetEnabled( \
triton::common::LogMessage::Level::kWARNING, (E))
#define LOG_ENABLE_ERROR(E) \
triton::common::gLogger_.SetEnabled( \
triton::common::LogMessage::Level::kERROR, (E))
#define LOG_SET_VERBOSE(L) \
triton::common::gLogger_.SetVerboseLevel( \
static_cast<uint32_t>(std::max(0, (L))))
#define LOG_SET_OUT_FILE(FN) triton::common::gLogger_.SetLogFile((FN))
#define LOG_SET_FORMAT(F) triton::common::gLogger_.SetLogFormat((F))
#define LOG_VERBOSE_LEVEL triton::common::gLogger_.VerboseLevel()
#define LOG_FORMAT triton::common::gLogger_.LogFormat()
#define LOG_FORMAT_STRING triton::common::gLogger_.LogFormatString()
#define LOG_FILE triton::common::gLogger_.LogFile()
#ifdef TRITON_ENABLE_LOGGING
#define LOG_INFO_IS_ON \
triton::common::gLogger_.IsEnabled(triton::common::LogMessage::Level::kINFO)
#define LOG_WARNING_IS_ON \
triton::common::gLogger_.IsEnabled( \
triton::common::LogMessage::Level::kWARNING)
#define LOG_ERROR_IS_ON \
triton::common::gLogger_.IsEnabled(triton::common::LogMessage::Level::kERROR)
#define LOG_VERBOSE_IS_ON(L) (triton::common::gLogger_.VerboseLevel() >= (L))
#else
// If logging is disabled, define the macros to be false to avoid further evaluation
#define LOG_INFO_IS_ON false
#define LOG_WARNING_IS_ON false
#define LOG_ERROR_IS_ON false
#define LOG_VERBOSE_IS_ON(L) false
#endif // TRITON_ENABLE_LOGGING
// Macros that use explicitly given filename and line number.
#define LOG_INFO_FL(FN, LN) \
if (LOG_INFO_IS_ON) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kINFO) \
.stream()
#define LOG_WARNING_FL(FN, LN) \
if (LOG_WARNING_IS_ON) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kWARNING) \
.stream()
#define LOG_ERROR_FL(FN, LN) \
if (LOG_ERROR_IS_ON) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kERROR) \
.stream()
#define LOG_VERBOSE_FL(L, FN, LN) \
if (LOG_VERBOSE_IS_ON(L)) \
triton::common::LogMessage( \
(char*)(FN), LN, triton::common::LogMessage::Level::kINFO) \
.stream()
// Macros that use current filename and line number.
#define LOG_INFO LOG_INFO_FL(__FILE__, __LINE__)
#define LOG_WARNING LOG_WARNING_FL(__FILE__, __LINE__)
#define LOG_ERROR LOG_ERROR_FL(__FILE__, __LINE__)
#define LOG_VERBOSE(L) LOG_VERBOSE_FL(L, __FILE__, __LINE__)
#define LOG_STATUS_ERROR(X, MSG) \
do { \
const Status& status__ = (X); \
if (!status__.IsOk()) { \
LOG_ERROR << (MSG) << ": " << status__.AsString(); \
} \
} while (false)
#define LOG_TRITONSERVER_ERROR(X, MSG) \
do { \
TRITONSERVER_Error* err__ = (X); \
if (err__ != nullptr) { \
LOG_ERROR << (MSG) << ": " << TRITONSERVER_ErrorCodeString(err__) \
<< " - " << TRITONSERVER_ErrorMessage(err__); \
TRITONSERVER_ErrorDelete(err__); \
} \
} while (false)
#define LOG_FLUSH triton::common::gLogger_.Flush()
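// Usage sketch (hypothetical): with TRITON_ENABLE_LOGGING defined, enable the
// desired levels and stream messages through the macros; when logging is
// disabled the *_IS_ON guards are false and the message expressions are
// never evaluated.
//
//   LOG_ENABLE_INFO(true);
//   LOG_ENABLE_WARNING(true);
//   LOG_SET_VERBOSE(1);
//
//   LOG_INFO << "model loaded";
//   LOG_WARNING << "falling back to CPU";
//   LOG_VERBOSE(1) << "batch size: " << 8;
//   LOG_FLUSH;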
}} // namespace triton::common
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <google/protobuf/any.pb.h>
#include <stdint.h>
#include "model_config.pb.h"
namespace triton { namespace common {
/// The type for a repeated dims field (used for shape).
using DimsList = ::google::protobuf::RepeatedField<::google::protobuf::int64>;
/// The type for the metric_tags map.
using MetricTagsMap = ::google::protobuf::Map<std::string, std::string>;
// Map from a host policy name to <setting, value> map of cmdline
// settings for the host policy.
using HostPolicyCmdlineConfig = std::map<std::string, std::string>;
using HostPolicyCmdlineConfigMap =
std::unordered_map<std::string, HostPolicyCmdlineConfig>;
// Map from backend name to list of setting=value pairs of cmdline
// settings for the backend.
using BackendCmdlineConfig = std::vector<std::pair<std::string, std::string>>;
using BackendCmdlineConfigMap =
std::unordered_map<std::string, BackendCmdlineConfig>;
/// The value for a dimension in a shape that indicates that that
/// dimension can take on any size.
constexpr int WILDCARD_DIM = -1;
constexpr int SCHEDULER_DEFAULT_NICE = 5;
/// Enumeration for the different platform types.
enum Platform {
PLATFORM_UNKNOWN = 0,
PLATFORM_TENSORRT_PLAN = 1,
PLATFORM_TENSORFLOW_GRAPHDEF = 2,
PLATFORM_TENSORFLOW_SAVEDMODEL = 3,
PLATFORM_ENSEMBLE = 4,
PLATFORM_ONNXRUNTIME_ONNX = 5,
PLATFORM_PYTORCH_LIBTORCH = 6
};
/// Get the number of elements in a shape.
/// \param dims The shape.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const DimsList& dims);
/// Get the number of elements in a shape.
/// \param dims The shape.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const std::vector<int64_t>& dims);
/// Get the number of elements in the shape of a model input.
/// \param mio The model input.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const inference::ModelInput& mio);
/// Get the number of elements in the shape of a model output.
/// \param mio The model output.
/// \return The number of elements, or -1 if the number of elements
/// cannot be determined because the shape contains one or more
/// wildcard dimensions.
int64_t GetElementCount(const inference::ModelOutput& mio);
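/// Example (illustrative): using the std::vector overload above,
///   GetElementCount(std::vector<int64_t>{2, 3, 4}) returns 24, while
///   GetElementCount(std::vector<int64_t>{WILDCARD_DIM, 3}) returns -1
/// because the wildcard dimension makes the element count indeterminate.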
/// Are values of a datatype fixed-size, or variable-sized.
/// \param dtype The data-type.
/// \return True if datatype values are fixed-sized, false if
/// variable-sized.
bool IsFixedSizeDataType(const inference::DataType dtype);
/// Get the size of objects of a given datatype in bytes.
/// \param dtype The data-type.
/// \return The size, in bytes, of objects of the datatype, or 0 if
/// size cannot be determined (for example, values of type TYPE_STRING
/// have variable length and so the size cannot be determined just from the
/// type).
size_t GetDataTypeByteSize(const inference::DataType dtype);
/// Get the size, in bytes, of a tensor based on datatype and
/// shape.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(const inference::DataType& dtype, const DimsList& dims);
/// Get the size, in bytes, of a tensor based on datatype and
/// shape.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const inference::DataType& dtype, const std::vector<int64_t>& dims);
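// Illustrative sketch (editorial addition); the shapes are hypothetical.
//
//   std::vector<int64_t> shape{4, 3};
//   GetByteSize(inference::TYPE_FP32, shape);      // 48 (12 elements * 4 bytes)
//
//   std::vector<int64_t> dynamic{WILDCARD_DIM, 3};
//   GetByteSize(inference::TYPE_FP32, dynamic);    // -1, size cannot be determined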
/// Get the size, in bytes, of a tensor based on batch-size, datatype
/// and shape. A tensor that has empty shape [] and non-zero
/// batch-size is sized as a tensor with shape [ batch-size ].
/// \param batch_size The batch-size. May be 0 to indicate no
/// batching.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const int batch_size, const inference::DataType& dtype,
const DimsList& dims);
/// Get the size, in bytes, of a tensor based on batch-size, datatype
/// and shape. A tensor that has empty shape [] and non-zero
/// batch-size is sized as a tensor with shape [ batch-size ].
/// \param batch_size The batch-size. May be 0 to indicate no
/// batching.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
const int batch_size, const inference::DataType& dtype,
const std::vector<int64_t>& dims);
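// Illustrative sketch (editorial addition) of the empty-shape case described
// above: with a non-zero batch size, an empty shape [] is treated as
// [ batch-size ].
//
//   std::vector<int64_t> empty_shape{};
//   GetByteSize(8, inference::TYPE_FP32, empty_shape);   // 32 (8 * 4 bytes)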
/// Get the size, in bytes, of a tensor based on ModelInput.
/// \param mio The ModelInput protobuf.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(const inference::ModelInput& mio);
/// Get the size, in bytes, of a tensor based on ModelOutput.
/// \param mio The ModelOutput protobuf.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(const inference::ModelOutput& mio);
/// Get the CPU thread nice level associated with a model
/// configuration's priority.
/// \param config The model configuration.
/// \return The nice level.
int GetCpuNiceLevel(const inference::ModelConfig& config);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// compared literally so that to be equal the two shapes must both
/// specify WILDCARD_DIM in the same dimensions.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDims(const DimsList& dims0, const DimsList& dims1);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// compared literally so that to be equal the two shapes must both
/// specify WILDCARD_DIM in the same dimensions.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDims(
const std::vector<int64_t>& dims0, const std::vector<int64_t>& dims1);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// allowed to match with any value. So, a dimension in one shape
/// specified as WILDCARD_DIM will always match the same dimension in
/// the other shape.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDimsWithWildcard(const DimsList& dims0, const DimsList& dims1);
/// Compare two model configuration shapes for equality. Wildcard
/// dimensions (that is, dimensions with size WILDCARD_DIM) are
/// allowed to match with any value. So, a dimension in one shape
/// specified as WILDCARD_DIM will always match the same dimension in
/// the other shape.
/// \param dims0 The first shape.
/// \param dims1 The second shape.
/// \return True if the shapes are equal, false if not equal.
bool CompareDimsWithWildcard(
const DimsList& dims0, const std::vector<int64_t>& dims1);
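// Illustrative sketch (editorial addition) contrasting literal and wildcard
// comparison; the shapes are hypothetical.
//
//   DimsList config_dims;                  // e.g. a [-1, 3] shape from a model config
//   config_dims.Add(WILDCARD_DIM);
//   config_dims.Add(3);
//   std::vector<int64_t> actual{4, 3};
//
//   CompareDimsWithWildcard(config_dims, actual);   // true, WILDCARD_DIM matches 4
//
//   std::vector<int64_t> wildcard_vec{WILDCARD_DIM, 3};
//   CompareDims(wildcard_vec, actual);              // false, WILDCARD_DIM compared literally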
/// Convert a DimsList to string representation.
/// \param dims The DimsList to be converted.
/// \return String representation of the DimsList in pattern
/// "[d0,d1,...,dn]"
std::string DimsListToString(const DimsList& dims);
/// Convert a vector representing a shape to string representation.
/// \param dims The vector of dimensions to be converted.
/// \param start_idx Index of the first dimension to include in the
/// string representation.
/// \return String representation of the vector in pattern
/// "[d0,d1,...,dn]".
std::string DimsListToString(
const std::vector<int64_t>& dims, const int start_idx = 0);
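// Illustrative sketch (editorial addition); the output pattern follows the
// documentation above.
//
//   std::vector<int64_t> dims{4, WILDCARD_DIM, 2};
//   DimsListToString(dims);                // "[4,-1,2]"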
/// Get the server protocol string representation of a datatype.
/// \param dtype The data type.
/// \return The string representation.
const char* DataTypeToProtocolString(const inference::DataType dtype);
/// Get the datatype corresponding to a server protocol string
/// representation of a datatype.
/// \param dtype The string representation of the datatype.
/// \return The data type.
inference::DataType ProtocolStringToDataType(const std::string& dtype);
/// Get the datatype corresponding to a server protocol string
/// representation of a datatype.
/// \param dtype Pointer to string.
/// \param len Length of the string.
/// \return The data type.
inference::DataType ProtocolStringToDataType(const char* dtype, size_t len);
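// Illustrative sketch (editorial addition). Treat the exact protocol string
// ("FP32") as an assumption of this sketch; it is the usual server protocol
// name for TYPE_FP32.
//
//   DataTypeToProtocolString(inference::TYPE_FP32);   // "FP32"
//   ProtocolStringToDataType("FP32");                 // inference::TYPE_FP32
//   ProtocolStringToDataType("FP32", 4);              // same, with explicit length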
}} // namespace triton::common