OpenDAS / Lmdeploy — Commits

Commit b30f3cdb, authored Nov 14, 2023 by xiabo
Commit message: Add the downloaded code
Parent: e38ee081
Changes: 157 files, +2686 additions, -0 deletions. The contents of the first 20 changed files are shown below.
3rdparty/backend-r22.12/examples/backends/recommended/src/recommended.cc (+750, -0)
3rdparty/backend-r22.12/examples/clients/bls_client (+86, -0)
3rdparty/backend-r22.12/examples/clients/minimal_client (+92, -0)
3rdparty/backend-r22.12/examples/clients/recommended_client (+91, -0)
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_python/1/model.py (+74, -0)
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_python/config.pbtxt (+58, -0)
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_tf/1/model.savedmodel/saved_model.pb (+0, -0)
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_tf/config.pbtxt (+28, -0)
3rdparty/backend-r22.12/examples/model_repos/bls_models/bls_fp32/config.pbtxt (+63, -0)
3rdparty/backend-r22.12/examples/model_repos/minimal_models/batching/1/.gitkeep (+0, -0)
3rdparty/backend-r22.12/examples/model_repos/minimal_models/batching/config.pbtxt (+24, -0)
3rdparty/backend-r22.12/examples/model_repos/minimal_models/nonbatching/1/.gitkeep (+0, -0)
3rdparty/backend-r22.12/examples/model_repos/minimal_models/nonbatching/config.pbtxt (+21, -0)
3rdparty/backend-r22.12/examples/model_repos/recommended_models/batching/1/.gitkeep (+0, -0)
3rdparty/backend-r22.12/examples/model_repos/recommended_models/batching/config.pbtxt (+24, -0)
3rdparty/backend-r22.12/include/triton/backend/backend_common.h (+672, -0)
3rdparty/backend-r22.12/include/triton/backend/backend_input_collector.h (+301, -0)
3rdparty/backend-r22.12/include/triton/backend/backend_memory.h (+138, -0)
3rdparty/backend-r22.12/include/triton/backend/backend_model.h (+146, -0)
3rdparty/backend-r22.12/include/triton/backend/backend_model_instance.h (+118, -0)
3rdparty/backend-r22.12/examples/backends/recommended/src/recommended.cc (new file, mode 100644)
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/core/tritonbackend.h"
namespace triton { namespace backend { namespace recommended {

//
// Backend that demonstrates the TRITONBACKEND API. This backend works
// for any model that has 1 input with any datatype and any shape and
// 1 output with the same shape and datatype as the input. The backend
// supports both batching and non-batching models.
//
// For each batch of requests, the backend returns the input tensor
// value in the output tensor.
//

/////////////

extern "C" {

// Triton calls TRITONBACKEND_Initialize when a backend is loaded into
// Triton to allow the backend to create and initialize any state that
// is intended to be shared across all models and model instances that
// use the backend. The backend should also verify version
// compatibility with Triton in this function.
//
TRITONSERVER_Error*
TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
  const char* cname;
  RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
  std::string name(cname);

  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("TRITONBACKEND_Initialize: ") + name).c_str());

  // Check the backend API version that Triton supports vs. what this
  // backend was compiled against. Make sure that the Triton major
  // version is the same and the minor version is >= what this backend
  // uses.
  uint32_t api_version_major, api_version_minor;
  RETURN_IF_ERROR(
      TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));

  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("Triton TRITONBACKEND API version: ") +
       std::to_string(api_version_major) + "." +
       std::to_string(api_version_minor))
          .c_str());
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("'") + name + "' TRITONBACKEND API version: " +
       std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." +
       std::to_string(TRITONBACKEND_API_VERSION_MINOR))
          .c_str());

  if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
      (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_UNSUPPORTED,
        "triton backend API version does not support this backend");
  }

  // The backend configuration may contain information needed by the
  // backend, such as tritonserver command-line arguments. This
  // backend doesn't use any such configuration but for this example
  // print whatever is available.
  TRITONSERVER_Message* backend_config_message;
  RETURN_IF_ERROR(
      TRITONBACKEND_BackendConfig(backend, &backend_config_message));

  const char* buffer;
  size_t byte_size;
  RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(
      backend_config_message, &buffer, &byte_size));
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("backend configuration:\n") + buffer).c_str());

  // This backend does not require any "global" state but as an
  // example create a string to demonstrate.
  std::string* state = new std::string("backend state");
  RETURN_IF_ERROR(
      TRITONBACKEND_BackendSetState(backend, reinterpret_cast<void*>(state)));

  return nullptr;  // success
}

// Triton calls TRITONBACKEND_Finalize when a backend is no longer
// needed.
//
TRITONSERVER_Error*
TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
{
  // Delete the "global" state associated with the backend.
  void* vstate;
  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate));
  std::string* state = reinterpret_cast<std::string*>(vstate);

  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("TRITONBACKEND_Finalize: state is '") + *state + "'")
          .c_str());

  delete state;

  return nullptr;  // success
}

}  // extern "C"

/////////////

//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model. ModelState is derived from BackendModel class
// provided in the backend utilities that provides many common
// functions.
//
class ModelState : public BackendModel {
 public:
  static TRITONSERVER_Error* Create(
      TRITONBACKEND_Model* triton_model, ModelState** state);
  virtual ~ModelState() = default;

  // Name of the input and output tensor
  const std::string& InputTensorName() const { return input_name_; }
  const std::string& OutputTensorName() const { return output_name_; }

  // Datatype of the input and output tensor
  TRITONSERVER_DataType TensorDataType() const { return datatype_; }

  // Shape of the input and output tensor as given in the model
  // configuration file. This shape will not include the batch
  // dimension (if the model has one).
  const std::vector<int64_t>& TensorNonBatchShape() const { return nb_shape_; }

  // Shape of the input and output tensor, including the batch
  // dimension (if the model has one). This method cannot be called
  // until the model is completely loaded and initialized, including
  // all instances of the model. In practice, this means that backend
  // should only call it in TRITONBACKEND_ModelInstanceExecute.
  TRITONSERVER_Error* TensorShape(std::vector<int64_t>& shape);

  // Validate that this model is supported by this backend.
  TRITONSERVER_Error* ValidateModelConfig();

 private:
  ModelState(TRITONBACKEND_Model* triton_model);

  std::string input_name_;
  std::string output_name_;
  TRITONSERVER_DataType datatype_;
  bool shape_initialized_;
  std::vector<int64_t> nb_shape_;
  std::vector<int64_t> shape_;
};

ModelState::ModelState(TRITONBACKEND_Model* triton_model)
    : BackendModel(triton_model), shape_initialized_(false)
{
  // Validate that the model's configuration matches what is supported
  // by this backend.
  THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig());
}

TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
  try {
    *state = new ModelState(triton_model);
  }
  catch (const BackendModelException& ex) {
    RETURN_ERROR_IF_TRUE(
        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
        std::string("unexpected nullptr in BackendModelException"));
    RETURN_IF_ERROR(ex.err_);
  }

  return nullptr;  // success
}

TRITONSERVER_Error*
ModelState::TensorShape(std::vector<int64_t>& shape)
{
  // This backend supports models that batch along the first dimension
  // and those that don't batch. For non-batch models the output shape
  // will be the shape from the model configuration. For batch models
  // the output shape will be the shape from the model configuration
  // prepended with [ -1 ] to represent the batch dimension. The
  // backend "responder" utility used below will set the appropriate
  // batch dimension value for each response. The shape needs to be
  // initialized lazily because the SupportsFirstDimBatching function
  // cannot be used until the model is completely loaded.
  if (!shape_initialized_) {
    bool supports_first_dim_batching;
    RETURN_IF_ERROR(SupportsFirstDimBatching(&supports_first_dim_batching));
    if (supports_first_dim_batching) {
      shape_.push_back(-1);
    }

    shape_.insert(shape_.end(), nb_shape_.begin(), nb_shape_.end());
    shape_initialized_ = true;
  }

  shape = shape_;

  return nullptr;  // success
}

TRITONSERVER_Error*
ModelState::ValidateModelConfig()
{
  // If verbose logging is enabled, dump the model's configuration as
  // JSON into the console output.
  if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
    common::TritonJson::WriteBuffer buffer;
    RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer));
    LOG_MESSAGE(
        TRITONSERVER_LOG_VERBOSE,
        (std::string("model configuration:\n") + buffer.Contents()).c_str());
  }

  // ModelConfig is the model configuration as a TritonJson
  // object. Use the TritonJson utilities to parse the JSON and
  // determine if the configuration is supported by this backend.
  common::TritonJson::Value inputs, outputs;
  RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &inputs));
  RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &outputs));

  // The model must have exactly 1 input and 1 output.
  RETURN_ERROR_IF_FALSE(
      inputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,
      std::string("model configuration must have 1 input"));
  RETURN_ERROR_IF_FALSE(
      outputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,
      std::string("model configuration must have 1 output"));

  common::TritonJson::Value input, output;
  RETURN_IF_ERROR(inputs.IndexAsObject(0, &input));
  RETURN_IF_ERROR(outputs.IndexAsObject(0, &output));

  // Record the input and output name in the model state.
  const char* input_name;
  size_t input_name_len;
  RETURN_IF_ERROR(
      input.MemberAsString("name", &input_name, &input_name_len));
  input_name_ = std::string(input_name);

  const char* output_name;
  size_t output_name_len;
  RETURN_IF_ERROR(
      output.MemberAsString("name", &output_name, &output_name_len));
  output_name_ = std::string(output_name);

  // Input and output must have same datatype
  std::string input_dtype, output_dtype;
  RETURN_IF_ERROR(input.MemberAsString("data_type", &input_dtype));
  RETURN_IF_ERROR(output.MemberAsString("data_type", &output_dtype));

  RETURN_ERROR_IF_FALSE(
      input_dtype == output_dtype, TRITONSERVER_ERROR_INVALID_ARG,
      std::string("expected input and output datatype to match, got ") +
          input_dtype + " and " + output_dtype);

  datatype_ = ModelConfigDataTypeToTritonServerDataType(input_dtype);

  // Input and output must have same shape. Reshape is not supported
  // on either input or output so flag an error if the model
  // configuration uses it.
  triton::common::TritonJson::Value reshape;
  RETURN_ERROR_IF_TRUE(
      input.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,
      std::string("reshape not supported for input tensor"));
  RETURN_ERROR_IF_TRUE(
      output.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,
      std::string("reshape not supported for output tensor"));

  std::vector<int64_t> input_shape, output_shape;
  RETURN_IF_ERROR(backend::ParseShape(input, "dims", &input_shape));
  RETURN_IF_ERROR(backend::ParseShape(output, "dims", &output_shape));

  RETURN_ERROR_IF_FALSE(
      input_shape == output_shape, TRITONSERVER_ERROR_INVALID_ARG,
      std::string("expected input and output shape to match, got ") +
          backend::ShapeToString(input_shape) + " and " +
          backend::ShapeToString(output_shape));

  nb_shape_ = input_shape;

  return nullptr;  // success
}

extern "C" {

// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded
// to allow the backend to create any state associated with the model,
// and to also examine the model configuration to determine if the
// configuration is suitable for the backend. Any errors reported by
// this function will prevent the model from loading.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
  // Create a ModelState object and associate it with the
  // TRITONBACKEND_Model. If anything goes wrong with initialization
  // of the model state then an error is returned and Triton will fail
  // to load the model.
  ModelState* model_state;
  RETURN_IF_ERROR(ModelState::Create(model, &model_state));
  RETURN_IF_ERROR(
      TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));

  return nullptr;  // success
}

// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer
// needed. The backend should cleanup any state associated with the
// model. This function will not be called until all model instances
// of the model have been finalized.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
  void* vstate;
  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
  ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
  delete model_state;

  return nullptr;  // success
}

}  // extern "C"

/////////////

//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each
// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from
// BackendModelInstance class provided in the backend utilities that
// provides many common functions.
//
class ModelInstanceState : public BackendModelInstance {
 public:
  static TRITONSERVER_Error* Create(
      ModelState* model_state,
      TRITONBACKEND_ModelInstance* triton_model_instance,
      ModelInstanceState** state);
  virtual ~ModelInstanceState() = default;

  // Get the state of the model that corresponds to this instance.
  ModelState* StateForModel() const { return model_state_; }

 private:
  ModelInstanceState(
      ModelState* model_state,
      TRITONBACKEND_ModelInstance* triton_model_instance)
      : BackendModelInstance(model_state, triton_model_instance),
        model_state_(model_state)
  {
  }

  ModelState* model_state_;
};

TRITONSERVER_Error*
ModelInstanceState::Create(
    ModelState* model_state,
    TRITONBACKEND_ModelInstance* triton_model_instance,
    ModelInstanceState** state)
{
  try {
    *state = new ModelInstanceState(model_state, triton_model_instance);
  }
  catch (const BackendModelInstanceException& ex) {
    RETURN_ERROR_IF_TRUE(
        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
        std::string("unexpected nullptr in BackendModelInstanceException"));
    RETURN_IF_ERROR(ex.err_);
  }

  return nullptr;  // success
}

extern "C" {

// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model
// instance is created to allow the backend to initialize any state
// associated with the instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
  // Get the model state associated with this instance's model.
  TRITONBACKEND_Model* model;
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));

  void* vmodelstate;
  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
  ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);

  // Create a ModelInstanceState object and associate it with the
  // TRITONBACKEND_ModelInstance.
  ModelInstanceState* instance_state;
  RETURN_IF_ERROR(
      ModelInstanceState::Create(model_state, instance, &instance_state));
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
      instance, reinterpret_cast<void*>(instance_state)));

  return nullptr;  // success
}

// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model
// instance is no longer needed. The backend should cleanup any state
// associated with the model instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
  void* vstate;
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
  ModelInstanceState* instance_state =
      reinterpret_cast<ModelInstanceState*>(vstate);
  delete instance_state;

  return nullptr;  // success
}

}  // extern "C"

/////////////

extern "C" {

// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required
// that a backend create a response for each request in the batch. A
// response may be the output tensors required for that request or may
// be an error that is returned in the response.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count)
{
  // Collect various timestamps during the execution of this batch or
  // requests. These values are reported below before returning from
  // the function.

  uint64_t exec_start_ns = 0;
  SET_TIMESTAMP(exec_start_ns);

  // Triton will not call this function simultaneously for the same
  // 'instance'. But since this backend could be used by multiple
  // instances from multiple models the implementation needs to handle
  // multiple calls to this function at the same time (with different
  // 'instance' objects). Best practice for a high-performance
  // implementation is to avoid introducing mutex/lock and instead use
  // only function-local and model-instance-specific state.
  ModelInstanceState* instance_state;
  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
      instance, reinterpret_cast<void**>(&instance_state)));
  ModelState* model_state = instance_state->StateForModel();

  // 'responses' is initialized as a parallel array to 'requests',
  // with one TRITONBACKEND_Response object for each
  // TRITONBACKEND_Request object. If something goes wrong while
  // creating these response objects, the backend simply returns an
  // error from TRITONBACKEND_ModelInstanceExecute, indicating to
  // Triton that this backend did not create or send any responses and
  // so it is up to Triton to create and send an appropriate error
  // response for each request. RETURN_IF_ERROR is one of several
  // useful macros for error handling that can be found in
  // backend_common.h.
  std::vector<TRITONBACKEND_Response*> responses;
  responses.reserve(request_count);
  for (uint32_t r = 0; r < request_count; ++r) {
    TRITONBACKEND_Request* request = requests[r];
    TRITONBACKEND_Response* response;
    RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
    responses.push_back(response);
  }

  // At this point, the backend takes ownership of 'requests', which
  // means that it is responsible for sending a response for every
  // request. From here, even if something goes wrong in processing,
  // the backend must return 'nullptr' from this function to indicate
  // success. Any errors and failures must be communicated via the
  // response objects.
  //
  // To simplify error handling, the backend utilities manage
  // 'responses' in a specific way and it is recommended that backends
  // follow this same pattern. When an error is detected in the
  // processing of a request, an appropriate error response is sent
  // and the corresponding TRITONBACKEND_Response object within
  // 'responses' is set to nullptr to indicate that the
  // request/response has already been handled and no further processing
  // should be performed for that request. Even if all responses fail,
  // the backend still allows execution to flow to the end of the
  // function so that statistics are correctly reported by the calls
  // to TRITONBACKEND_ModelInstanceReportStatistics and
  // TRITONBACKEND_ModelInstanceReportBatchStatistics.
  // RESPOND_AND_SET_NULL_IF_ERROR, and
  // RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from
  // backend_common.h that assist in this management of response
  // objects.

  // The backend could iterate over the 'requests' and process each
  // one separately. But for performance reasons it is usually
  // preferred to create batched input tensors that are processed
  // simultaneously. This is especially true for devices like GPUs
  // that are capable of exploiting the large amount parallelism
  // exposed by larger data sets.
  //
  // The backend utilities provide a "collector" to facilitate this
  // batching process. The 'collector's ProcessTensor function will
  // combine a tensor's value from each request in the batch into a
  // single contiguous buffer. The buffer can be provided by the
  // backend or 'collector' can create and manage it. In this backend,
  // there is not a specific buffer into which the batch should be
  // created, so use ProcessTensor arguments that cause collector to
  // manage it. ProcessTensor does NOT support TRITONSERVER_TYPE_BYTES
  // data type.
  BackendInputCollector collector(
      requests, request_count, &responses, model_state->TritonMemoryManager(),
      false /* pinned_enabled */, nullptr /* stream*/);

  // To instruct ProcessTensor to "gather" the entire batch of input
  // tensors into a single contiguous buffer in CPU memory, set the
  // "allowed input types" to be the CPU ones (see tritonserver.h in
  // the triton-inference-server/core repo for allowed memory types).
  std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>
      allowed_input_types = {
          {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};

  const char* input_buffer;
  size_t input_buffer_byte_size;
  TRITONSERVER_MemoryType input_buffer_memory_type;
  int64_t input_buffer_memory_type_id;

  RESPOND_ALL_AND_SET_NULL_IF_ERROR(
      responses, request_count,
      collector.ProcessTensor(
          model_state->InputTensorName().c_str(),
          nullptr /* existing_buffer */, 0 /* existing_buffer_byte_size */,
          allowed_input_types, &input_buffer, &input_buffer_byte_size,
          &input_buffer_memory_type, &input_buffer_memory_type_id));

  // Finalize the collector. If 'true' is returned, 'input_buffer'
  // will not be valid until the backend synchronizes the CUDA
  // stream or event that was used when creating the collector. For
  // this backend, GPU is not supported and so no CUDA sync should
  // be needed; so if 'true' is returned simply log an error.
  const bool need_cuda_input_sync = collector.Finalize();
  if (need_cuda_input_sync) {
    LOG_MESSAGE(
        TRITONSERVER_LOG_ERROR,
        "'recommended' backend: unexpected CUDA sync required by collector");
  }

  // 'input_buffer' contains the batched input tensor. The backend can
  // implement whatever logic is necessary to produce the output
  // tensor. This backend simply logs the input tensor value and then
  // returns the input tensor value in the output tensor so no actual
  // computation is needed.

  uint64_t compute_start_ns = 0;
  SET_TIMESTAMP(compute_start_ns);

  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("model ") + model_state->Name() + ": requests in batch " +
       std::to_string(request_count))
          .c_str());
  std::string tstr;
  IGNORE_ERROR(BufferAsTypedString(
      tstr, input_buffer, input_buffer_byte_size,
      model_state->TensorDataType()));
  LOG_MESSAGE(
      TRITONSERVER_LOG_INFO,
      (std::string("batched " + model_state->InputTensorName() + " value: ") +
       tstr)
          .c_str());

  const char* output_buffer = input_buffer;
  TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type;
  int64_t output_buffer_memory_type_id = input_buffer_memory_type_id;

  uint64_t compute_end_ns = 0;
  SET_TIMESTAMP(compute_end_ns);

  bool supports_first_dim_batching;
  RESPOND_ALL_AND_SET_NULL_IF_ERROR(
      responses, request_count,
      model_state->SupportsFirstDimBatching(&supports_first_dim_batching));

  std::vector<int64_t> tensor_shape;
  RESPOND_ALL_AND_SET_NULL_IF_ERROR(
      responses, request_count, model_state->TensorShape(tensor_shape));

  // Because the output tensor values are concatenated into a single
  // contiguous 'output_buffer', the backend must "scatter" them out
  // to the individual response output tensors. The backend utilities
  // provide a "responder" to facilitate this scattering process.
  // BackendOutputResponder does NOT support TRITONSERVER_TYPE_BYTES
  // data type.

  // The 'responders's ProcessTensor function will copy the portion of
  // 'output_buffer' corresonding to each request's output into the
  // response for that request.
  BackendOutputResponder responder(
      requests, request_count, &responses, model_state->TritonMemoryManager(),
      supports_first_dim_batching, false /* pinned_enabled */,
      nullptr /* stream*/);

  responder.ProcessTensor(
      model_state->OutputTensorName().c_str(), model_state->TensorDataType(),
      tensor_shape, output_buffer, output_buffer_memory_type,
      output_buffer_memory_type_id);

  // Finalize the responder. If 'true' is returned, the output
  // tensors' data will not be valid until the backend synchronizes
  // the CUDA stream or event that was used when creating the
  // responder. For this backend, GPU is not supported and so no CUDA
  // sync should be needed; so if 'true' is returned simply log an
  // error.
  const bool need_cuda_output_sync = responder.Finalize();
  if (need_cuda_output_sync) {
    LOG_MESSAGE(
        TRITONSERVER_LOG_ERROR,
        "'recommended' backend: unexpected CUDA sync required by responder");
  }

  // Send all the responses that haven't already been sent because of
  // an earlier error.
  for (auto& response : responses) {
    if (response != nullptr) {
      LOG_IF_ERROR(
          TRITONBACKEND_ResponseSend(
              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
          "failed to send response");
    }
  }

  uint64_t exec_end_ns = 0;
  SET_TIMESTAMP(exec_end_ns);

#ifdef TRITON_ENABLE_STATS
  // For batch statistics need to know the total batch size of the
  // requests. This is not necessarily just the number of requests,
  // because if the model supports batching then any request can be a
  // batched request itself.
  size_t total_batch_size = 0;
  if (!supports_first_dim_batching) {
    total_batch_size = request_count;
  } else {
    for (uint32_t r = 0; r < request_count; ++r) {
      auto& request = requests[r];
      TRITONBACKEND_Input* input = nullptr;
      LOG_IF_ERROR(
          TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input),
          "failed getting request input");
      if (input != nullptr) {
        const int64_t* shape = nullptr;
        LOG_IF_ERROR(
            TRITONBACKEND_InputProperties(
                input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr),
            "failed getting input properties");
        if (shape != nullptr) {
          total_batch_size += shape[0];
        }
      }
    }
  }
#else
  (void)exec_start_ns;
  (void)exec_end_ns;
  (void)compute_start_ns;
  (void)compute_end_ns;
#endif  // TRITON_ENABLE_STATS

  // Report statistics for each request, and then release the request.
  for (uint32_t r = 0; r < request_count; ++r) {
    auto& request = requests[r];

#ifdef TRITON_ENABLE_STATS
    LOG_IF_ERROR(
        TRITONBACKEND_ModelInstanceReportStatistics(
            instance_state->TritonModelInstance(), request,
            (responses[r] != nullptr) /* success */, exec_start_ns,
            compute_start_ns, compute_end_ns, exec_end_ns),
        "failed reporting request statistics");
#endif  // TRITON_ENABLE_STATS

    LOG_IF_ERROR(
        TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
        "failed releasing request");
  }

#ifdef TRITON_ENABLE_STATS
  // Report batch statistics.
  LOG_IF_ERROR(
      TRITONBACKEND_ModelInstanceReportBatchStatistics(
          instance_state->TritonModelInstance(), total_batch_size,
          exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns),
      "failed reporting batch request statistics");
#endif  // TRITON_ENABLE_STATS

  return nullptr;  // success
}

}  // extern "C"

}}}  // namespace triton::backend::recommended
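For orientation, the backend above simply echoes its single input tensor into its single output. Against the recommended_models/batching configuration added later in this commit (tensor names INPUT/OUTPUT, FP32, dims [ 4, 4 ], max_batch_size 8), a minimal synchronous check could look roughly like the sketch below; the server URL is an assumption, and the committed examples/clients/recommended_client shows the fuller asynchronous variant.

# Sketch only (not part of the commit): one synchronous request to the
# "batching" model served by the "recommended" backend; the output should
# equal the input because the backend copies the input tensor to the output.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url='localhost:8000')  # assumed URL

data = np.random.rand(1, 4, 4).astype(np.float32)
inp = httpclient.InferInput('INPUT', [1, 4, 4], "FP32")
inp.set_data_from_numpy(data)

result = client.infer('batching', [inp])
assert np.allclose(result.as_numpy('OUTPUT'), data)  # echo behavior
print('PASS')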
3rdparty/backend-r22.12/examples/clients/bls_client (new file, mode 100644)
#!/usr/bin/python
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys
import argparse
import numpy as np
import tritonhttpclient as httpclient
from tritonclientutils import np_to_triton_dtype

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    model_name = "bls_fp32"
    shape = [16]

    with httpclient.InferenceServerClient(url=FLAGS.url) as client:
        input0_data = np.random.rand(*shape).astype(np.float32)
        input1_data = np.random.rand(*shape).astype(np.float32)
        inputs = [
            httpclient.InferInput("INPUT0", input0_data.shape,
                                  np_to_triton_dtype(input0_data.dtype)),
            httpclient.InferInput("INPUT1", input1_data.shape,
                                  np_to_triton_dtype(input1_data.dtype)),
        ]

        inputs[0].set_data_from_numpy(input0_data)
        inputs[1].set_data_from_numpy(input1_data)

        outputs = [
            httpclient.InferRequestedOutput("OUTPUT0"),
            httpclient.InferRequestedOutput("OUTPUT1"),
        ]

        response = client.infer(model_name,
                                inputs,
                                request_id=str(1),
                                outputs=outputs)

        result = response.get_response()
        output0_data = response.as_numpy("OUTPUT0")
        output1_data = response.as_numpy("OUTPUT1")

        print("INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})".format(
            input0_data, input1_data, output0_data))
        print("INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})".format(
            input0_data, input1_data, output1_data))

        if not np.allclose(input0_data + input1_data, output0_data):
            print("error: incorrect sum")
            sys.exit(1)

        if not np.allclose(input0_data - input1_data, output1_data):
            print("error: incorrect difference")
            sys.exit(1)

        print('\nPASS')
        sys.exit(0)
3rdparty/backend-r22.12/examples/clients/minimal_client (new file, mode 100644)
#!/usr/bin/env python
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import sys

import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    # For the HTTP client, need to specify large enough concurrency to
    # issue all the inference requests to the server in parallel. For
    # this example we want to be able to send 2 requests concurrently.
    try:
        concurrent_request_count = 2
        triton_client = httpclient.InferenceServerClient(
            url=FLAGS.url, concurrency=concurrent_request_count)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    # First send a single request to the nonbatching model.
    print('=========')
    input0_data = np.array([1, 2, 3, 4], dtype=np.int32)
    print('Sending request to nonbatching model: IN0 = {}'.format(input0_data))

    inputs = [httpclient.InferInput('IN0', [4], "INT32")]
    inputs[0].set_data_from_numpy(input0_data)

    result = triton_client.infer('nonbatching', inputs)

    print('Response: {}'.format(result.get_response()))
    print('OUT0 = {}'.format(result.as_numpy('OUT0')))

    # Send 2 requests to the batching model. Because these are sent
    # asynchronously and Triton's dynamic batcher is configured to
    # delay up to 5 seconds when forming a batch for this model, we
    # expect these 2 requests to be batched within Triton and sent to
    # the minimal backend as a single batch.
    print('\n=========')
    async_requests = []

    input0_data = np.array([[10, 11, 12, 13]], dtype=np.int32)
    print('Sending request to batching model: IN0 = {}'.format(input0_data))

    inputs = [httpclient.InferInput('IN0', [1, 4], "INT32")]
    inputs[0].set_data_from_numpy(input0_data)
    async_requests.append(triton_client.async_infer('batching', inputs))

    input0_data = np.array([[20, 21, 22, 23]], dtype=np.int32)
    print('Sending request to batching model: IN0 = {}'.format(input0_data))

    inputs = [httpclient.InferInput('IN0', [1, 4], "INT32")]
    inputs[0].set_data_from_numpy(input0_data)
    async_requests.append(triton_client.async_infer('batching', inputs))

    for async_request in async_requests:
        # Get the result from the initiated asynchronous inference
        # request. This call will block till the server responds.
        result = async_request.get_result()

        print('Response: {}'.format(result.get_response()))
        print('OUT0 = {}'.format(result.as_numpy('OUT0')))
3rdparty/backend-r22.12/examples/clients/recommended_client (new file, mode 100644)
#!/usr/bin/env python
# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import sys

import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import InferenceServerException

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-u',
                        '--url',
                        type=str,
                        required=False,
                        default='localhost:8000',
                        help='Inference server URL. Default is localhost:8000.')

    FLAGS = parser.parse_args()

    # For the HTTP client, need to specify large enough concurrency to
    # issue all the inference requests to the server in parallel. For
    # this example we want to be able to send 2 requests concurrently.
    try:
        concurrent_request_count = 2
        triton_client = httpclient.InferenceServerClient(
            url=FLAGS.url, concurrency=concurrent_request_count)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit(1)

    # Send 2 requests to the batching model. Because these are sent
    # asynchronously and Triton's dynamic batcher is configured to
    # delay up to 5 seconds when forming a batch for this model, we
    # expect these 2 requests to be batched within Triton and sent to
    # the backend as a single batch.
    #
    # The recommended backend can handle any model with 1 input and 1
    # output as long as the input and output datatype and shape are
    # the same. The batching model uses datatype FP32 and shape
    # [ 4, 4 ].
    print('\n=========')
    async_requests = []

    input0_data = np.array([[[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3],
                             [3.0, 3.1, 3.2, 3.3], [4.0, 4.1, 4.2, 4.3]]],
                           dtype=np.float32)
    print('Sending request to batching model: input = {}'.format(input0_data))

    inputs = [httpclient.InferInput('INPUT', [1, 4, 4], "FP32")]
    inputs[0].set_data_from_numpy(input0_data)
    async_requests.append(triton_client.async_infer('batching', inputs))

    input0_data = np.array([[[10.0, 10.1, 10.2, 10.3], [20.0, 20.1, 20.2, 20.3],
                             [30.0, 30.1, 30.2, 30.3], [40.0, 40.1, 40.2, 40.3]]],
                           dtype=np.float32)
    print('Sending request to batching model: input = {}'.format(input0_data))

    inputs = [httpclient.InferInput('INPUT', [1, 4, 4], "FP32")]
    inputs[0].set_data_from_numpy(input0_data)
    async_requests.append(triton_client.async_infer('batching', inputs))

    for async_request in async_requests:
        # Get the result from the initiated asynchronous inference
        # request. This call will block till the server responds.
        result = async_request.get_result()

        print('Response: {}'.format(result.get_response()))
        print('OUTPUT = {}'.format(result.as_numpy('OUTPUT')))
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_python/1/model.py (new file, mode 100644)
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import json

import triton_python_backend_utils as pb_utils


# This model calculates the sum and difference of the INPUT0 and INPUT1 and put
# the results in OUTPUT0 and OUTPUT1 respectively. For more information
# regarding how this model.py was written, please refer to Python Backend.
class TritonPythonModel:

    def initialize(self, args):
        self.model_config = model_config = json.loads(args['model_config'])

        output0_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT0")
        output1_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT1")

        self.output0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])
        self.output1_dtype = pb_utils.triton_string_to_numpy(
            output1_config['data_type'])

    def execute(self, requests):
        output0_dtype = self.output0_dtype
        output1_dtype = self.output1_dtype

        responses = []
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            in_1 = pb_utils.get_input_tensor_by_name(request, "INPUT1")

            out_0, out_1 = (in_0.as_numpy() + in_1.as_numpy(),
                            in_0.as_numpy() - in_1.as_numpy())

            out_tensor_0 = pb_utils.Tensor("OUTPUT0",
                                           out_0.astype(output0_dtype))
            out_tensor_1 = pb_utils.Tensor("OUTPUT1",
                                           out_1.astype(output1_dtype))

            inference_response = pb_utils.InferenceResponse(
                output_tensors=[out_tensor_0, out_tensor_1])
            responses.append(inference_response)

        return responses

    def finalize(self):
        print('Cleaning up...')
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_python/config.pbtxt (new file, mode 100644)
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "addsub_python"
backend: "python"
max_batch_size: 0
input [
{
name: "INPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
input [
{
name: "INPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
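Taken together with the model.py above, this configuration means addsub_python accepts two FP32 tensors of shape [ 16 ] and returns their element-wise sum and difference. A direct call to it, bypassing the bls_fp32 model that the committed bls_client exercises, could look roughly like the following sketch; the server URL is an assumption.

# Sketch only (not part of the commit): call addsub_python directly over HTTP.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url='localhost:8000')  # assumed URL

a = np.random.rand(16).astype(np.float32)
b = np.random.rand(16).astype(np.float32)

inputs = [httpclient.InferInput('INPUT0', [16], "FP32"),
          httpclient.InferInput('INPUT1', [16], "FP32")]
inputs[0].set_data_from_numpy(a)
inputs[1].set_data_from_numpy(b)

result = client.infer('addsub_python', inputs)
assert np.allclose(result.as_numpy('OUTPUT0'), a + b)  # element-wise sum
assert np.allclose(result.as_numpy('OUTPUT1'), a - b)  # element-wise difference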
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_tf/1/model.savedmodel/saved_model.pb (new file, mode 100644; binary file added)
3rdparty/backend-r22.12/examples/model_repos/bls_models/addsub_tf/config.pbtxt (new file, mode 100644)
name: "addsub_tf"
platform: "tensorflow_savedmodel"
max_batch_size: 0
input [
{
name: "INPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
},
{
name: "INPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
},
{
name: "OUTPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
3rdparty/backend-r22.12/examples/model_repos/bls_models/bls_fp32/config.pbtxt (new file, mode 100644)
# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "bls_fp32"
backend: "bls"
max_batch_size: 0
input [
{
name: "INPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
input [
{
name: "INPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
output [
{
name: "OUTPUT1"
data_type: TYPE_FP32
dims: [ 16 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
3rdparty/backend-r22.12/examples/model_repos/minimal_models/batching/1/.gitkeep (new empty file, mode 100644)
3rdparty/backend-r22.12/examples/model_repos/minimal_models/batching/config.pbtxt (new file, mode 100644)
backend: "minimal"
max_batch_size: 8
dynamic_batching {
max_queue_delay_microseconds: 5000000
}
input [
{
name: "IN0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
output [
{
name: "OUT0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
3rdparty/backend-r22.12/examples/model_repos/minimal_models/nonbatching/1/.gitkeep (new empty file, mode 100644)
3rdparty/backend-r22.12/examples/model_repos/minimal_models/nonbatching/config.pbtxt (new file, mode 100644)
backend: "minimal"
max_batch_size: 0
input [
{
name: "IN0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
output [
{
name: "OUT0"
data_type: TYPE_INT32
dims: [ 4 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
3rdparty/backend-r22.12/examples/model_repos/recommended_models/batching/1/.gitkeep (new empty file, mode 100644)
3rdparty/backend-r22.12/examples/model_repos/recommended_models/batching/config.pbtxt (new file, mode 100644)
backend: "recommended"
max_batch_size: 8
dynamic_batching {
max_queue_delay_microseconds: 5000000
}
input [
{
name: "INPUT"
data_type: TYPE_FP32
dims: [ 4, 4 ]
}
]
output [
{
name: "OUTPUT"
data_type: TYPE_FP32
dims: [ 4, 4 ]
}
]
instance_group [
{
kind: KIND_CPU
}
]
3rdparty/backend-r22.12/include/triton/backend/backend_common.h (new file, mode 100644)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "triton/common/error.h"
#include "triton/core/tritonbackend.h"
#define TRITONJSON_STATUSTYPE TRITONSERVER_Error*
#define TRITONJSON_STATUSRETURN(M) \
return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str())
#define TRITONJSON_STATUSSUCCESS nullptr
#include "triton/common/triton_json.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#define IGNORE_ERROR(X) \
do { \
TRITONSERVER_Error* ie_err__ = (X); \
if (ie_err__ != nullptr) { \
TRITONSERVER_ErrorDelete(ie_err__); \
} \
} while (false)
#define LOG_IF_ERROR(X, MSG) \
do { \
TRITONSERVER_Error* lie_err__ = (X); \
if (lie_err__ != nullptr) { \
IGNORE_ERROR(TRITONSERVER_LogMessage( \
TRITONSERVER_LOG_INFO, __FILE__, __LINE__, \
(std::string(MSG) + ": " + TRITONSERVER_ErrorCodeString(lie_err__) + \
" - " + TRITONSERVER_ErrorMessage(lie_err__)) \
.c_str())); \
TRITONSERVER_ErrorDelete(lie_err__); \
} \
} while (false)
#define LOG_MESSAGE(LEVEL, MSG) \
do { \
LOG_IF_ERROR( \
TRITONSERVER_LogMessage(LEVEL, __FILE__, __LINE__, MSG), \
("failed to log message: ")); \
} while (false)
#define RETURN_ERROR_IF_FALSE(P, C, MSG) \
do { \
if (!(P)) { \
return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \
} \
} while (false)
#define RETURN_ERROR_IF_TRUE(P, C, MSG) \
do { \
if ((P)) { \
return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \
} \
} while (false)
#define RETURN_IF_ERROR(X) \
do { \
TRITONSERVER_Error* rie_err__ = (X); \
if (rie_err__ != nullptr) { \
return rie_err__; \
} \
} while (false)
#ifdef TRITON_ENABLE_GPU
#define LOG_IF_CUDA_ERROR(X, MSG) \
do { \
cudaError_t lice_err__ = (X); \
if (lice_err__ != cudaSuccess) { \
IGNORE_ERROR(TRITONSERVER_LogMessage( \
TRITONSERVER_LOG_INFO, __FILE__, __LINE__, \
(std::string(MSG) + ": " + cudaGetErrorString(lice_err__)) \
.c_str())); \
} \
} while (false)
#define RETURN_IF_CUDA_ERROR(X, C, MSG) \
do { \
cudaError_t rice_err__ = (X); \
if (rice_err__ != cudaSuccess) { \
return TRITONSERVER_ErrorNew( \
C, ((MSG) + ": " + cudaGetErrorString(rice_err__)).c_str()); \
} \
} while (false)
#endif // TRITON_ENABLE_GPU
#define RESPOND_AND_SET_NULL_IF_ERROR(RESPONSE_PTR, X) \
do { \
TRITONSERVER_Error* rarie_err__ = (X); \
if (rarie_err__ != nullptr) { \
if (*RESPONSE_PTR != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
*RESPONSE_PTR, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
rarie_err__), \
"failed to send error response"); \
*RESPONSE_PTR = nullptr; \
} \
TRITONSERVER_ErrorDelete(rarie_err__); \
} \
} while (false)
#define RESPOND_ALL_AND_SET_NULL_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
do { \
TRITONSERVER_Error* raasnie_err__ = (X); \
if (raasnie_err__ != nullptr) { \
for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \
if (RESPONSES[ridx] != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
raasnie_err__), \
"failed to send error response"); \
RESPONSES[ridx] = nullptr; \
} \
} \
TRITONSERVER_ErrorDelete(raasnie_err__); \
} \
} while (false)
#define RESPOND_ALL_AND_SET_TRUE_IF_ERROR(RESPONSES, RESPONSES_COUNT, BOOL, X) \
do { \
TRITONSERVER_Error* raasnie_err__ = (X); \
if (raasnie_err__ != nullptr) { \
BOOL = true; \
for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) { \
if (RESPONSES[ridx] != nullptr) { \
LOG_IF_ERROR( \
TRITONBACKEND_ResponseSend( \
RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \
raasnie_err__), \
"failed to send error response"); \
RESPONSES[ridx] = nullptr; \
} \
} \
TRITONSERVER_ErrorDelete(raasnie_err__); \
} \
} while (false)
#ifdef TRITON_ENABLE_STATS
#define TIMESPEC_TO_NANOS(TS) ((TS).tv_sec * 1000000000 + (TS).tv_nsec)
#define SET_TIMESTAMP(TS_NS) \
{ \
TS_NS = std::chrono::duration_cast<std::chrono::nanoseconds>( \
std::chrono::steady_clock::now().time_since_epoch()) \
.count(); \
}
#define DECL_TIMESTAMP(TS_NS) \
uint64_t TS_NS; \
SET_TIMESTAMP(TS_NS);
#else
#define DECL_TIMESTAMP(TS_NS)
#define SET_TIMESTAMP(TS_NS)
#endif // TRITON_ENABLE_STATS
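// Illustrative usage sketch (an assumption, not part of the original header):
// the timestamp macros are commonly used to bracket execution so the elapsed
// time can be reported through the TRITONBACKEND statistics APIs, e.g.
//
//   DECL_TIMESTAMP(exec_start_ns);
//   // ... run inference for the batch ...
//   DECL_TIMESTAMP(exec_end_ns);
//   // exec_end_ns - exec_start_ns can then be reported via
//   // TRITONBACKEND_ModelInstanceReportStatistics.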
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
/// Convenience deleter for TRITONBACKEND_ResponseFactory.
struct ResponseFactoryDeleter {
  void operator()(TRITONBACKEND_ResponseFactory* f)
  {
    LOG_IF_ERROR(
        TRITONBACKEND_ResponseFactoryDelete(f),
        "failed deleting response factory");
  }
};
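// Illustrative usage sketch (an assumption, not part of the original header):
// the deleter is intended for use with smart pointers so the response factory
// is always released, e.g.
//
//   TRITONBACKEND_ResponseFactory* factory_ptr = nullptr;
//   RETURN_IF_ERROR(TRITONBACKEND_ResponseFactoryNew(&factory_ptr, request));
//   std::unique_ptr<TRITONBACKEND_ResponseFactory, ResponseFactoryDeleter>
//       factory(factory_ptr);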
// A representation of the BatchInput message in model config
class BatchInput {
 public:
  enum class Kind {
    BATCH_ELEMENT_COUNT,
    BATCH_ACCUMULATED_ELEMENT_COUNT,
    BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO,
    BATCH_MAX_ELEMENT_COUNT_AS_SHAPE,
    BATCH_ITEM_SHAPE,
    BATCH_ITEM_SHAPE_FLATTEN
  };

  static TRITONSERVER_Error* ParseFromModelConfig(
      triton::common::TritonJson::Value& config,
      std::vector<BatchInput>* batch_inputs);

  const std::vector<std::string>& TargetNames() const { return target_names_; }
  TRITONSERVER_DataType DataType() const { return data_type_; }
  Kind BatchInputKind() const { return kind_; }
  std::string BatchInputKindString() const { return kind_str_; }
  const std::vector<std::string>& SourceInputs() const
  {
    return source_inputs_;
  }

 private:
  TRITONSERVER_Error* Init(triton::common::TritonJson::Value& bi_config);

  Kind kind_;
  std::string kind_str_;
  std::vector<std::string> target_names_;
  TRITONSERVER_DataType data_type_;
  std::vector<std::string> source_inputs_;
};
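// Illustrative usage sketch (an assumption, not part of the original header):
// batch inputs declared in config.pbtxt are typically parsed once at model
// load time from the parsed model configuration ('model_config' is assumed to
// be a triton::common::TritonJson::Value), e.g.
//
//   std::vector<BatchInput> batch_inputs;
//   RETURN_IF_ERROR(
//       BatchInput::ParseFromModelConfig(model_config, &batch_inputs));
//   for (const auto& bi : batch_inputs) {
//     LOG_MESSAGE(
//         TRITONSERVER_LOG_INFO,
//         (std::string("batch input kind: ") + bi.BatchInputKindString())
//             .c_str());
//   }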
// A representation of the BatchOutput message in model config
class BatchOutput {
 public:
  enum class Kind { BATCH_SCATTER_WITH_INPUT_SHAPE };

  static TRITONSERVER_Error* ParseFromModelConfig(
      triton::common::TritonJson::Value& config,
      std::vector<BatchOutput>* batch_outputs);

  const std::vector<std::string>& TargetNames() const { return target_names_; }
  TRITONSERVER_DataType DataType() const { return data_type_; }
  const std::vector<int64_t>& OutputShape() const { return shape_; }
  Kind BatchOutputKind() const { return kind_; }
  const std::vector<std::string>& SourceInputs() const
  {
    return source_inputs_;
  }

 private:
  Kind kind_;
  std::vector<std::string> target_names_;
  TRITONSERVER_DataType data_type_;
  std::vector<int64_t> shape_;
  std::vector<std::string> source_inputs_;
};
struct CopyParams {
  CopyParams(void* dst, const void* src, const size_t byte_size)
      : dst_(dst), src_(src), byte_size_(byte_size)
  {
  }

  void* dst_;
  const void* src_;
  const size_t byte_size_;
};
/// The value for a dimension in a shape that indicates that that
/// dimension can take on any size.
constexpr int WILDCARD_DIM = -1;

constexpr char kTensorRTExecutionAccelerator[] = "tensorrt";
constexpr char kOpenVINOExecutionAccelerator[] = "openvino";
constexpr char kGPUIOExecutionAccelerator[] = "gpu_io";
constexpr char kAutoMixedPrecisionExecutionAccelerator[] =
    "auto_mixed_precision";

TRITONSERVER_MemoryType GetUsePinnedMemoryType(
    TRITONSERVER_MemoryType ref_buffer_type);

TRITONSERVER_Error* CommonErrorToTritonError(triton::common::Error error);

TRITONSERVER_Error_Code StatusCodeToTritonCode(
    triton::common::Error::Code error_code);
/// Parse an array in a JSON object into the corresponding shape. The
/// array must be composed of integers.
///
/// \param io The JSON object containing the member array.
/// \param name The name of the array member in the JSON object.
/// \param shape Returns the shape.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseShape(
    common::TritonJson::Value& io, const std::string& name,
    std::vector<int64_t>* shape);
/// Return the string representation of a shape.
///
/// \param dims The shape dimensions.
/// \param dims_count The number of dimensions.
/// \return The string representation.
std::string ShapeToString(const int64_t* dims, const size_t dims_count);
/// Return the string representation of a shape.
///
/// \param shape The shape as a vector of dimensions.
/// \return The string representation.
std::string ShapeToString(const std::vector<int64_t>& shape);
/// Return the number of elements of a shape.
///
/// \param dims The shape dimensions.
/// \param dims_count The number of dimensions.
/// \return The number of elements.
int64_t GetElementCount(const int64_t* dims, const size_t dims_count);
/// Return the number of elements of a shape.
///
/// \param shape The shape as a vector of dimensions.
/// \return The number of elements.
int64_t GetElementCount(const std::vector<int64_t>& shape);
/// Get the size, in bytes, of a tensor based on datatype and
/// shape.
/// \param dtype The data-type.
/// \param dims The shape.
/// \return The size, in bytes, of the corresponding tensor, or -1 if
/// unable to determine the size.
int64_t GetByteSize(
    const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims);
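// Illustrative usage sketch (an assumption, not part of the original header):
// element count and byte size are commonly combined to size output buffers,
// e.g. for an FP32 tensor of shape [8, 128]:
//
//   std::vector<int64_t> shape{8, 128};
//   int64_t element_count = GetElementCount(shape);                  // 1024
//   int64_t byte_size = GetByteSize(TRITONSERVER_TYPE_FP32, shape);  // 4096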
/// Get an input tensor's contents into a buffer. This overload expects
/// both 'buffer' and buffers of the input to be in CPU.
///
/// \param request The inference request.
/// \param input_name The name of the input buffer.
/// \param buffer The buffer where the input tensor content is copied into.
/// \param buffer_byte_size Acts as both input and output. On input
/// gives the size of 'buffer', in bytes. The function will fail if
/// the buffer is not large enough to hold the input tensor
/// contents. Returns the size of the input tensor data returned in
/// 'buffer'.
/// \param host_policy_name The host policy name to look up the input buffer.
/// Default input buffer will be used if nullptr is provided.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ReadInputTensor(
    TRITONBACKEND_Request* request, const std::string& input_name,
    char* buffer, size_t* buffer_byte_size,
    const char* host_policy_name = nullptr);
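// Illustrative usage sketch (an assumption, not part of the original header):
// reading a small scalar control input (here a hypothetical tensor named
// "START") into local CPU memory, e.g.
//
//   int32_t start = 0;
//   size_t start_byte_size = sizeof(start);
//   RETURN_IF_ERROR(ReadInputTensor(
//       request, "START", reinterpret_cast<char*>(&start), &start_byte_size));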
/// Get an input tensor's contents into a buffer. This overload of
/// 'ReadInputTensor' supports input buffers that can be in any memory.
///
/// \param request The inference request.
/// \param input_name The name of the input buffer.
/// \param buffer The buffer where the input tensor content is copied into.
/// \param buffer_byte_size Acts as both input and output. On input
/// gives the size of 'buffer', in bytes. The function will fail if
/// the buffer is not large enough to hold the input tensor
/// contents. Returns the size of the input tensor data returned in
/// 'buffer'.
/// \param host_policy_name The host policy name to look up the input buffer.
/// Default input buffer will be used if nullptr is provided.
/// \param memory_type The memory type of the buffer provided.
/// \param memory_type_id The memory type id of the buffer provided.
/// \param cuda_stream specifies the stream to be associated with, and 0 can be
/// passed for default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ReadInputTensor(
    TRITONBACKEND_Request* request, const std::string& input_name,
    char* buffer, size_t* buffer_byte_size,
    TRITONSERVER_MemoryType memory_type, int64_t memory_type_id,
    cudaStream_t cuda_stream, bool* cuda_used,
    const char* host_policy_name = nullptr, const bool copy_on_stream = false);
/// Validate that an input matches one of the allowed input names.
/// \param io The model input.
/// \param allowed The set of allowed input names.
/// \return The error status. A non-OK status indicates the input
/// is not valid.
TRITONSERVER_Error* CheckAllowedModelInput(
    common::TritonJson::Value& io, const std::set<std::string>& allowed);
/// Validate that an output matches one of the allowed output names.
/// \param io The model output.
/// \param allowed The set of allowed output names.
/// \return The error status. A non-OK status indicates the output
/// is not valid.
TRITONSERVER_Error* CheckAllowedModelOutput(
    common::TritonJson::Value& io, const std::set<std::string>& allowed);
/// Get the tensor name, false value, and true value for a boolean
/// sequence batcher control kind. If 'required' is true then must
/// find a tensor for the control. If 'required' is false, return
/// 'tensor_name' as empty-string if the control is not mapped to any
/// tensor.
///
/// \param batcher The JSON object of the sequence batcher.
/// \param model_name The name of the model.
/// \param control_kind The kind of control tensor to look for.
/// \param required Whether the tensor must be specified.
/// \param tensor_name Returns the name of the tensor.
/// \param tensor_datatype Returns the data type of the tensor.
/// \param fp32_false_value Returns the float value for false if
/// the tensor type is FP32.
/// \param fp32_true_value Returns the float value for true if
/// the tensor type is FP32.
/// \param int32_false_value Returns the int value for false if
/// the tensor type is INT32.
/// \param int32_true_value Returns the int value for true if
/// the tensor type is INT32.
/// \param bool_false_value Returns the bool value for false if
/// the tensor type is BOOL.
/// \param bool_true_value Returns the bool value for true if
/// the tensor type is BOOL.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* GetBooleanSequenceControlProperties(
    common::TritonJson::Value& batcher, const std::string& model_name,
    const std::string& control_kind, const bool required,
    std::string* tensor_name, std::string* tensor_datatype,
    float* fp32_false_value, float* fp32_true_value,
    int32_t* int32_false_value, int32_t* int32_true_value,
    bool* bool_false_value, bool* bool_true_value);
/// Get the tensor name and datatype for a non-boolean sequence
/// batcher control kind. If 'required' is true then must find a
/// tensor for the control. If 'required' is false, return
/// 'tensor_name' as empty-string if the control is not mapped to any
/// tensor. 'tensor_datatype' returns the required datatype for the
/// control.
///
/// \param batcher The JSON object of the sequence batcher.
/// \param model_name The name of the model.
/// \param control_kind The kind of control tensor to look for.
/// \param required Whether the tensor must be specified.
/// \param tensor_name Returns the name of the tensor.
/// \param tensor_datatype Returns the data type of the tensor.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* GetTypedSequenceControlProperties(
    common::TritonJson::Value& batcher, const std::string& model_name,
    const std::string& control_kind, const bool required,
    std::string* tensor_name, std::string* tensor_datatype);
/// Create and send an error response for a set of requests. This
/// function takes ownership of 'response_err' and so the caller must
/// not access or delete it after this call returns.
///
/// \param requests The requests.
/// \param request_count The number of 'requests'.
/// \param response_err The error to send to each request.
/// \param release_request If true, the requests will be released after
/// sending the error responses and the request pointers are set to
/// nullptr.
void RequestsRespondWithError(
    TRITONBACKEND_Request** requests, const uint32_t request_count,
    TRITONSERVER_Error* response_err, const bool release_request = true);
/// Send an error response for a set of responses. This function takes
/// ownership of 'response_err' and so the caller must not access or
/// delete it after this call returns.
///
/// \param responses The responses.
/// \param response_count The number of 'responses'.
/// \param response_err The error to send.
void SendErrorForResponses(
    std::vector<TRITONBACKEND_Response*>* responses,
    const uint32_t response_count, TRITONSERVER_Error* response_err);
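// Illustrative usage sketch (an assumption, not part of the original header):
// a backend commonly aborts a whole batch by sending the same error to every
// remaining response, e.g.
//
//   SendErrorForResponses(
//       &responses, request_count,
//       TRITONSERVER_ErrorNew(
//           TRITONSERVER_ERROR_INTERNAL, "failed to run inference"));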
/// Copy buffer from 'src' to 'dst' for given 'byte_size'. The buffer location
/// is identified by the memory type and id, and the corresponding copy will be
/// initiated.
/// \param msg The message to be prepended in error message.
/// \param src_memory_type The memory type of the source buffer.
/// \param src_memory_type_id The memory type id of the source buffer.
/// \param dst_memory_type The memory type of the destination buffer.
/// \param dst_memory_type_id The memory type id of the destination buffer.
/// \param byte_size The byte size of the source buffer.
/// \param src The pointer to the source buffer.
/// \param dst The pointer to the destination buffer.
/// \param cuda_stream specifies the stream to be associated with, and 0 can be
/// passed for default stream.
/// \param cuda_used returns whether a CUDA memory copy is initiated. If true,
/// the caller should synchronize on the given 'cuda_stream' to ensure data copy
/// is completed.
/// \param copy_on_stream whether the memory copies should be performed in cuda
/// host functions on the 'cuda_stream'.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* CopyBuffer(
    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,
    const int64_t src_memory_type_id,
    const TRITONSERVER_MemoryType dst_memory_type,
    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,
    void* dst, cudaStream_t cuda_stream, bool* cuda_used,
    const bool copy_on_stream = false);
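// Illustrative usage sketch (an assumption, not part of the original header):
// copying an output tensor from a CPU staging buffer into the destination
// buffer provided by Triton (which may be GPU memory); the variable names are
// hypothetical, e.g.
//
//   bool cuda_used = false;
//   RETURN_IF_ERROR(CopyBuffer(
//       "OUTPUT0", TRITONSERVER_MEMORY_CPU, 0 /* src_memory_type_id */,
//       dst_memory_type, dst_memory_type_id, byte_size, cpu_buffer,
//       dst_buffer, stream, &cuda_used));
//   if (cuda_used) {
//     cudaStreamSynchronize(stream);
//   }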
/// Does a file or directory exist?
/// \param path The path to check for existence.
/// \param exists Returns true if file/dir exists
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* FileExists(const std::string& path, bool* exists);
/// Read a text file into a string.
/// \param path The path of the file.
/// \param contents Returns the contents of the file.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ReadTextFile(
    const std::string& path, std::string* contents);
/// Is a path a directory?
/// \param path The path to check.
/// \param is_dir Returns true if path represents a directory
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* IsDirectory(const std::string& path, bool* is_dir);
/// Join path segments into a longer path
/// \param segments The path segments.
/// \return the path formed by joining the segments.
std::string JoinPath(std::initializer_list<std::string> segments);
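// Illustrative usage sketch (an assumption, not part of the original header):
// locating a model artifact under the version directory; 'repository_path',
// 'version', and the artifact name "model.plan" are hypothetical, e.g.
//
//   std::string artifact = JoinPath(
//       {repository_path, std::to_string(version), "model.plan"});
//   bool exists = false;
//   RETURN_IF_ERROR(FileExists(artifact, &exists));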
/// Returns the content in the model version path and the path to the content as
/// key-value pairs.
/// \param model_repository_path The path to the model repository.
/// \param version The version of the model.
/// \param ignore_directories Whether the directories will be ignored.
/// \param ignore_files Whether the files will be ignored.
/// \param model_paths Returns the content in the model version path and
/// the path to the content.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ModelPaths(
    const std::string& model_repository_path, uint64_t version,
    const bool ignore_directories, const bool ignore_files,
    std::unordered_map<std::string, std::string>* model_paths);
/// Create a CUDA stream appropriate for GPU<->CPU data transfer
/// operations for a given GPU device. The caller takes ownership of
/// the stream. 'stream' returns nullptr if GPU support is disabled.
///
/// \param device_id The ID of the GPU.
/// \param cuda_stream_priority The stream priority. Use 0 for normal priority.
/// \param stream Returns the created stream.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* CreateCudaStream(
    const int device_id, const int cuda_stream_priority,
    cudaStream_t* stream);
/// Parse the string as long long integer.
///
/// \param value The string.
/// \param parse_value The long long integral value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseLongLongValue(
    const std::string& value, int64_t* parsed_value);
/// Parse the string as unsigned long long integer.
///
/// \param value The string.
/// \param parse_value The unsigned long long integral value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseUnsignedLongLongValue(
    const std::string& value, uint64_t* parsed_value);
/// Parse the string as boolean.
///
/// \param value The string.
/// \param parse_value The boolean value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseBoolValue(
    const std::string& value, bool* parsed_value);
/// Parse the string as integer.
///
/// \param value The string.
/// \param parse_value The integral value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseIntValue(const std::string& value, int* parsed_value);
/// Parse the string as double.
///
/// \param value The string.
/// \param parse_value The double value of the string.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* ParseDoubleValue(
    const std::string& value, double* parsed_value);
/// Return the value of the specified key in a JSON object.
///
/// \param params The JSON object containing the key-value mapping.
/// \param key The key to look up the value in the JSON object.
/// \param value Returns the value.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* GetParameterValue(
    triton::common::TritonJson::Value& params, const std::string& key,
    std::string* value);
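// Illustrative usage sketch (an assumption, not part of the original header):
// these helpers convert string-valued parameters from the model config; the
// parameter key "intra_op_thread_count" is hypothetical, e.g.
//
//   std::string value_str;
//   RETURN_IF_ERROR(
//       GetParameterValue(params, "intra_op_thread_count", &value_str));
//   int thread_count = 0;
//   RETURN_IF_ERROR(ParseIntValue(value_str, &thread_count));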
/// Return the Triton server data type of the data type string specified
/// in model config JSON.
///
/// \param data_type_str The string representation of the data type.
/// \return the Triton server data type.
TRITONSERVER_DataType ModelConfigDataTypeToTritonServerDataType(
    const std::string& data_type_str);
/// Try to parse the requested parameter.
///
/// \param params The param in model config
/// \param mkey Key in the model config.
/// \param value The parsed string value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
    triton::common::TritonJson::Value& params, const std::string& mkey,
    std::string* value, const std::string& default_value);
/// Try to parse the requested parameter.
///
/// \param params The param in model config
/// \param mkey Key in the model config.
/// \param value The parsed int value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
    triton::common::TritonJson::Value& params, const std::string& mkey,
    int* value, const int& default_value);
/// Try to parse the requested parameter.
///
/// \param params The param in model config
/// \param mkey Key in the model config.
/// \param value The parsed bool value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
    triton::common::TritonJson::Value& params, const std::string& mkey,
    bool* value, const bool& default_value);
/// Try to parse the requested parameter.
///
/// \param params The param in model config
/// \param mkey Key in the model config.
/// \param value The parsed uint64 value.
/// \param default_value Default value to use when key is not found.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* TryParseModelStringParameter(
    triton::common::TritonJson::Value& params, const std::string& mkey,
    uint64_t* value, const uint64_t& default_value);
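// Illustrative usage sketch (an assumption, not part of the original header):
// reading an optional model-config parameter with a fallback default; the
// key "max_queue_size" is hypothetical, e.g.
//
//   triton::common::TritonJson::Value params;
//   model_config.Find("parameters", &params);
//   int max_queue_size = 0;
//   RETURN_IF_ERROR(TryParseModelStringParameter(
//       params, "max_queue_size", &max_queue_size, 4 /* default */));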
/// Get a string representation of a tensor buffer.
///
/// \param str Returns the string.
/// \param buffer The base pointer to the tensor buffer.
/// \param buffer_byte_size The size of the buffer in bytes.
/// \param datatype The type of the tensor
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_Error* BufferAsTypedString(
    std::string& str, const char* buffer, size_t buffer_byte_size,
    TRITONSERVER_DataType datatype);
/// Get the ID of the request as a string formatted for logging.
///
/// \param request Request of which to get the ID.
/// \return a formatted string for logging the request ID.
std::string GetRequestId(TRITONBACKEND_Request* request);

}}  // namespace triton::backend
3rdparty/backend-r22.12/include/triton/backend/backend_input_collector.h
0 → 100644
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <list>
#include <memory>
#include <string>
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_memory.h"
#include "triton/common/async_work_queue.h"
#include "triton/common/sync_queue.h"
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
using cudaEvent_t = void*;
#endif // !TRITON_ENABLE_GPU
//
// BackendInputCollector
//
class BackendInputCollector {
 public:
// The caller can optionally provide 'event' for internal synchronization
// instead of using 'stream'. If 'host_policy_name' is provided, it must be
// valid for the lifetime of the collector
  explicit BackendInputCollector(
      TRITONBACKEND_Request** requests, const uint32_t request_count,
      std::vector<TRITONBACKEND_Response*>* responses,
      TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,
      cudaStream_t stream, cudaEvent_t event = nullptr,
      cudaEvent_t buffer_ready_event = nullptr,
      const size_t kernel_buffer_threshold = 0,
      const char* host_policy_name = nullptr,
      const bool copy_on_stream = false,
      const bool coalesce_request_input = false)
      : need_sync_(false), requests_(requests),
        request_count_(request_count), responses_(responses),
        memory_manager_(memory_manager), pinned_enabled_(pinned_enabled),
        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
        stream_(stream), event_(event),
        buffer_ready_event_(buffer_ready_event),
        kernel_buffer_threshold_(kernel_buffer_threshold),
        pending_pinned_byte_size_(0), pending_pinned_offset_(0),
        pending_copy_kernel_buffer_byte_size_(0),
        pending_copy_kernel_buffer_offset_(0),
        pending_copy_kernel_input_buffer_counts_(0), async_task_count_(0),
        host_policy_cstr_(host_policy_name), copy_on_stream_(copy_on_stream),
        coalesce_request_input_(coalesce_request_input)
  {
  }

  ~BackendInputCollector() = default;
// Process all requests for a named input tensor and return the
// concatenated values of those requests in a single contiguous
// buffer. This overload of the function can avoid data copy if the
// tensor values are already contiguous and the caller doesn't
// provide a destination 'buffer'.
//
// 'buffer' is used to determine whether the input should be placed at the
// 'buffer' provided by the caller. If 'buffer' == nullptr, the returned
// buffer will be managed by the BackendInputCollector object and
// has the same lifecycle as the BackendInputCollector object.
// 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr.
// 'allowed_input_types' is the ordered list of the memory type and id pairs
// that the returned buffer can be. It must only contain the memory type
// and id of 'buffer' if 'buffer' is not nullptr.
// 'dst_buffer' returns the contiguous buffer of the input tensor.
// 'dst_buffer_byte_size' the byte size of 'dst_buffer'.
// 'dst_memory_type' returns the memory type of 'dst_buffer'.
// 'dst_memory_type_id' returns the memory type id of 'dst_buffer'.
  TRITONSERVER_Error* ProcessTensor(
      const char* input_name, char* buffer, const size_t buffer_byte_size,
      const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
          allowed_input_types,
      const char** dst_buffer, size_t* dst_buffer_byte_size,
      TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id);
// Process all requests for a named input tensor and return the
// concatenated values of those requests in a single contiguous
// 'buffer'.
//
  // 'buffer' The buffer to hold the concatenated tensor values. Must
  // be large enough to hold all tensor values.
// 'buffer_byte_size' is the byte size of 'buffer'.
// 'dst_memory_type' The memory type of 'buffer'.
// 'dst_memory_type_id' The memory type id of 'buffer'.
  void ProcessTensor(
      const char* input_name, char* buffer, const size_t buffer_byte_size,
      const TRITONSERVER_MemoryType memory_type,
      const int64_t memory_type_id);
// Process the batch input and return its shape. Returning error indicates
// that the batch input can't be formed properly and the caller should abort
// the whole batch.
  TRITONSERVER_Error* BatchInputShape(
      const BatchInput& batch_input, std::vector<int64_t>* shape);
// Process the batch input and derive its value into 'buffer'. Returning
// error indicates that the batch input can't be formed properly and
// the caller should abort the whole batch.
// 'buffer' is used to determine whether the input should be placed at the
// 'buffer' provided by the caller. If 'buffer' == nullptr, the returned
// buffer will be managed by the BackendInputCollector object and
// has the same lifecycle as the BackendInputCollector object.
// 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr.
// 'allowed_input_types' is the ordered list of the memory type and id pairs
// that the returned buffer can be. It must only contain the memory type
// and id of 'buffer' if it is not nullptr.
// 'dst_buffer' returns the contiguous buffer of the input tensor.
// 'dst_memory_type' returns the memory type of 'dst_buffer'.
// 'dst_memory_type_id' returns the memory type id of 'dst_buffer'.
  TRITONSERVER_Error* ProcessBatchInput(
      const BatchInput& batch_input, char* buffer,
      const size_t buffer_byte_size,
      const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
          allowed_input_types,
      const char** dst_buffer, size_t* dst_buffer_byte_size,
      TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id);
// Finalize processing of all requests for all input tensors. Return
// true if cudaMemcpyAsync is called, and the caller should call
// cudaStreamSynchronize (or cudaEventSynchronize on 'event') before
// using the data.
  bool Finalize();
 private:
  struct ContiguousBuffer {
    ContiguousBuffer() : start_request_idx_(0), end_request_idx_(0) {}

    MemoryDesc memory_desc_;
    size_t start_request_idx_;
    size_t end_request_idx_;
  };
  class InputIterator {
   public:
    InputIterator(
        TRITONBACKEND_Request** requests, const uint32_t request_count,
        std::vector<TRITONBACKEND_Response*>* responses,
        const char* input_name, const char* host_policy_name,
        const bool coalesce_request_input);
// Return false if iterator reaches the end of inputs, 'input' is not set.
    bool GetNextContiguousInput(ContiguousBuffer* input);

   private:
    TRITONBACKEND_Request** requests_;
    const uint32_t request_count_;
    std::vector<TRITONBACKEND_Response*>* responses_;
    const char* input_name_;
    const char* host_policy_;
    const bool coalesce_request_input_;
    TRITONBACKEND_Input* curr_input_;
    size_t curr_request_idx_;
    size_t curr_buffer_idx_;
    uint32_t curr_buffer_cnt_;
    bool reach_end_;
  };
// Return whether the entire input is in a contiguous buffer. If returns true,
// the properties of the contiguous input buffer will also be returned.
// Otherwise, only 'buffer_byte_size' will be set and return the total byte
// size of the input.
  bool GetInputBufferIfContiguous(
      const char* input_name, const char** buffer, size_t* buffer_byte_size,
      TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
  bool FlushPendingPinned(
      char* tensor_buffer, const size_t tensor_buffer_byte_size,
      const TRITONSERVER_MemoryType tensor_memory_type,
      const int64_t tensor_memory_type_id);
  bool FlushPendingCopyKernel(
      char* tensor_buffer, const size_t tensor_buffer_byte_size,
      const TRITONSERVER_MemoryType tensor_memory_type,
      const int64_t tensor_memory_type_id);
  TRITONSERVER_Error* LaunchCopyKernel(
      char* tensor_buffer, const size_t tensor_buffer_byte_size,
      const TRITONSERVER_MemoryType tensor_memory_type,
      const int64_t tensor_memory_type_id);
  bool SetInputTensor(
      const char* input_name, const ContiguousBuffer& input,
      char* tensor_buffer, const size_t tensor_buffer_byte_size,
      const TRITONSERVER_MemoryType tensor_memory_type,
      const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,
      const TRITONSERVER_MemoryType use_pinned_memory_type,
      const bool use_kernel, const bool wait_buffer);
  template <typename T>
  TRITONSERVER_Error* SetElementCount(
      const std::string& source_input, char* buffer,
      const size_t buffer_byte_size);
  template <typename T>
  TRITONSERVER_Error* SetAccumulatedElementCount(
      const std::string& source_input, char* buffer,
      const size_t buffer_byte_size);
  template <typename T>
  TRITONSERVER_Error* SetBatchItemShape(
      const std::string& source_input, char* buffer,
      const size_t buffer_byte_size);
  bool need_sync_;
  TRITONBACKEND_Request** requests_;
  const uint32_t request_count_;
  std::vector<TRITONBACKEND_Response*>* responses_;
  TRITONBACKEND_MemoryManager* memory_manager_;
  const bool pinned_enabled_;
  const bool use_async_cpu_copy_;
  cudaStream_t stream_;
  cudaEvent_t event_;
  cudaEvent_t buffer_ready_event_;
  const size_t kernel_buffer_threshold_;

  size_t pending_pinned_byte_size_;
  size_t pending_pinned_offset_;
  std::list<ContiguousBuffer> pending_pinned_input_buffers_;
// managed memories that need to live over the lifetime of this
// BackendInputCollector object.
  std::list<std::unique_ptr<BackendMemory>> in_use_memories_;

  size_t pending_copy_kernel_buffer_byte_size_;
  size_t pending_copy_kernel_buffer_offset_;
  size_t pending_copy_kernel_input_buffer_counts_;
  std::list<ContiguousBuffer> pending_copy_kernel_input_buffers_;
  std::vector<std::unique_ptr<std::vector<int8_t*>>> input_ptr_buffer_host_;
  std::vector<std::unique_ptr<std::vector<size_t>>> byte_size_buffer_host_;
  std::vector<std::unique_ptr<std::vector<size_t>>>
      byte_size_offset_buffer_host_;
// Pinned memory buffers and the corresponding request_inputs where
// the final copy to the tensor is deferred until Finalize() after
// waiting for all in-flight copies.
  struct DeferredPinned {
    DeferredPinned(
        char* pinned_memory, const size_t pinned_memory_size,
        char* tensor_buffer, const size_t tensor_buffer_offset,
        const TRITONSERVER_MemoryType tensor_memory_type,
        const int64_t tensor_memory_id,
        std::list<ContiguousBuffer>&& request_buffers,
        std::vector<TRITONBACKEND_Response*>* responses)
        : finalized_(false), pinned_memory_(pinned_memory),
          pinned_memory_size_(pinned_memory_size),
          tensor_buffer_(tensor_buffer),
          tensor_buffer_offset_(tensor_buffer_offset),
          tensor_memory_type_(tensor_memory_type),
          tensor_memory_id_(tensor_memory_id),
          requests_(std::move(request_buffers)), responses_(responses)
    {
    }

    bool Finalize(cudaStream_t stream);

    bool finalized_;
    // Holding reference to the pinned memory buffer, which is managed
    // by BackendInputCollector as 'pinned_memory'
    char* pinned_memory_;
    const size_t pinned_memory_size_;
    char* tensor_buffer_;
    const size_t tensor_buffer_offset_;
    const TRITONSERVER_MemoryType tensor_memory_type_;
    const int64_t tensor_memory_id_;
    std::list<ContiguousBuffer> requests_;
    std::vector<TRITONBACKEND_Response*>* responses_;
  };
  std::list<DeferredPinned> deferred_pinned_;
// FIXME use future to maintain an issue-order queue to drop task count
  triton::common::SyncQueue<bool> completion_queue_;
  size_t async_task_count_;
  const char* host_policy_cstr_;
  const bool copy_on_stream_;
  const bool coalesce_request_input_;
};
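// Illustrative usage sketch (an assumption, not part of the original header):
// a typical execute path collects a named input across the batched requests
// into one contiguous CPU buffer and then finalizes; 'model_state' is a
// hypothetical BackendModel-derived state object, e.g.
//
//   BackendInputCollector collector(
//       requests, request_count, &responses,
//       model_state->TritonMemoryManager(), model_state->EnablePinnedInput(),
//       stream);
//   const char* input_buffer = nullptr;
//   size_t input_byte_size = 0;
//   TRITONSERVER_MemoryType mem_type = TRITONSERVER_MEMORY_CPU;
//   int64_t mem_type_id = 0;
//   RESPOND_ALL_AND_SET_NULL_IF_ERROR(
//       responses, request_count,
//       collector.ProcessTensor(
//           "INPUT0", nullptr /* buffer */, 0 /* buffer_byte_size */,
//           {{TRITONSERVER_MEMORY_CPU, 0}}, &input_buffer, &input_byte_size,
//           &mem_type, &mem_type_id));
//   if (collector.Finalize()) {
//     cudaStreamSynchronize(stream);
//   }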
}}  // namespace triton::backend
3rdparty/backend-r22.12/include/triton/backend/backend_memory.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include <vector>
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
namespace triton { namespace backend {
// Collection of common properties that describe a buffer in Triton
struct MemoryDesc {
  MemoryDesc()
      : buffer_(nullptr), byte_size_(0),
        memory_type_(TRITONSERVER_MEMORY_CPU), memory_type_id_(0)
  {
  }

  MemoryDesc(
      const char* buffer, size_t byte_size,
      TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
      : buffer_(buffer), byte_size_(byte_size), memory_type_(memory_type),
        memory_type_id_(memory_type_id)
  {
  }

  const char* buffer_;
  size_t byte_size_;
  TRITONSERVER_MemoryType memory_type_;
  int64_t memory_type_id_;
};
//
// BackendMemory
//
// Utility class for allocating and deallocating memory using both
// TRITONBACKEND_MemoryManager and direct GPU and CPU malloc/free.
//
class BackendMemory {
 public:
  enum class AllocationType {
    CPU,
    CPU_PINNED,
    GPU,
    CPU_PINNED_POOL,
    GPU_POOL
  };
// Allocate a contiguous block of 'alloc_type' memory. 'mem'
// returns the pointer to the allocated memory.
//
// CPU, CPU_PINNED_POOL and GPU_POOL are allocated using
// TRITONBACKEND_MemoryManagerAllocate. Note that CPU_PINNED and GPU
// allocations can be much slower than the POOL variants.
//
// Two error codes have specific interpretations for this function:
//
// TRITONSERVER_ERROR_UNSUPPORTED: Indicates that function is
// incapable of allocating the requested memory type and memory
// type ID. Requests for the memory type and ID will always fail
// no matter 'byte_size' of the request.
//
// TRITONSERVER_ERROR_UNAVAILABLE: Indicates that function can
// allocate the memory type and ID but that currently it cannot
// allocate a contiguous block of memory of the requested
// 'byte_size'.
  static TRITONSERVER_Error* Create(
      TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
      const int64_t memory_type_id, const size_t byte_size,
      BackendMemory** mem);
// Allocate a contiguous block of memory by attempting the
// allocation using 'alloc_types' in order until one is successful.
// See BackendMemory::Create() above for details.
  static TRITONSERVER_Error* Create(
      TRITONBACKEND_MemoryManager* manager,
      const std::vector<AllocationType>& alloc_types,
      const int64_t memory_type_id, const size_t byte_size,
      BackendMemory** mem);
// Creates a BackendMemory object from a pre-allocated buffer. The buffer
// is not owned by the object created with this function. Hence, for
  // proper operation, the lifetime of the buffer should extend at least
  // until the corresponding BackendMemory object is destroyed.
  static TRITONSERVER_Error* Create(
      TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,
      const int64_t memory_type_id, void* buffer, const size_t byte_size,
      BackendMemory** mem);
  ~BackendMemory();

  AllocationType AllocType() const { return alloctype_; }
  int64_t MemoryTypeId() const { return memtype_id_; }
  char* MemoryPtr() { return buffer_; }
  size_t ByteSize() const { return byte_size_; }
  TRITONSERVER_MemoryType MemoryType() const
  {
    return AllocTypeToMemoryType(alloctype_);
  }

  static TRITONSERVER_MemoryType AllocTypeToMemoryType(
      const AllocationType a);
  static const char* AllocTypeString(const AllocationType a);

 private:
  BackendMemory(
      TRITONBACKEND_MemoryManager* manager, const AllocationType alloctype,
      const int64_t memtype_id, char* buffer, const size_t byte_size,
      const bool owns_buffer = true)
      : manager_(manager), alloctype_(alloctype), memtype_id_(memtype_id),
        buffer_(buffer), byte_size_(byte_size), owns_buffer_(owns_buffer)
  {
  }

  TRITONBACKEND_MemoryManager* manager_;
  AllocationType alloctype_;
  int64_t memtype_id_;
  char* buffer_;
  size_t byte_size_;
  bool owns_buffer_;
};
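// Illustrative usage sketch (an assumption, not part of the original header):
// allocating a scratch buffer, preferring pooled pinned memory and falling
// back to plain CPU memory; 'memory_manager' and 'byte_size' are assumed
// local variables, e.g.
//
//   BackendMemory* scratch = nullptr;
//   RETURN_IF_ERROR(BackendMemory::Create(
//       memory_manager,
//       {BackendMemory::AllocationType::CPU_PINNED_POOL,
//        BackendMemory::AllocationType::CPU},
//       0 /* memory_type_id */, byte_size, &scratch));
//   std::unique_ptr<BackendMemory> scratch_owner(scratch);
//   char* base = scratch->MemoryPtr();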
}}  // namespace triton::backend
3rdparty/backend-r22.12/include/triton/backend/backend_model.h
0 → 100644
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <map>
#include <set>
#include <string>
#include "triton/backend/backend_common.h"
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
namespace triton { namespace backend {
//
// BackendModel
//
// Common functionality for a backend model. This class is provided as
// a convenience; backends are not required to use this class.
//
class BackendModel {
 public:
  BackendModel(
      TRITONBACKEND_Model* triton_model, const bool allow_optional = false);
  virtual ~BackendModel() = default;
// Get the handle to the TRITONBACKEND server hosting this model.
  TRITONSERVER_Server* TritonServer() { return triton_server_; }
// Get the handle to the memory manager for this model.
  TRITONBACKEND_MemoryManager* TritonMemoryManager()
  {
    return triton_memory_manager_;
  }
// Get the handle to the TRITONBACKEND model.
  TRITONBACKEND_Model* TritonModel() { return triton_model_; }
// Get the name and version of the model.
  const std::string& Name() const { return name_; }
  uint64_t Version() const { return version_; }
  const std::string& RepositoryPath() const { return repository_path_; }
// The model configuration.
  common::TritonJson::Value& ModelConfig() { return model_config_; }
// Sets the updated model configuration to the core.
  TRITONSERVER_Error* SetModelConfig();
// Parses information out of the model configuration.
  TRITONSERVER_Error* ParseModelConfig();
// Maximum batch size supported by the model. A value of 0
// indicates that the model does not support batching.
  int MaxBatchSize() const { return max_batch_size_; }
// Set the max batch size for the model. When a backend
// auto-completes a configuration it may set or change the maximum
// batch size.
  void SetMaxBatchSize(const int b) { max_batch_size_ = b; }
// Does this model support batching in the first dimension?
  TRITONSERVER_Error* SupportsFirstDimBatching(bool* supports);
// Use indirect pinned memory buffer when copying an input or output
// tensor to/from the model.
  bool EnablePinnedInput() const { return enable_pinned_input_; }
  bool EnablePinnedOutput() const { return enable_pinned_output_; }

  const std::vector<BatchInput>& BatchInputs() const { return batch_inputs_; }
  const std::vector<BatchOutput>& BatchOutputs() const
  {
    return batch_outputs_;
  }
  const BatchOutput* FindBatchOutput(const std::string& output_name) const;

  bool IsInputRagged(const std::string& input_name) const
  {
    return (ragged_inputs_.find(input_name) != ragged_inputs_.end());
  }

  bool IsInputOptional(const std::string& input_name) const
  {
    return (optional_inputs_.find(input_name) != optional_inputs_.end());
  }
 protected:
  TRITONSERVER_Server* triton_server_;
  TRITONBACKEND_MemoryManager* triton_memory_manager_;
  TRITONBACKEND_Model* triton_model_;
  std::string name_;
  uint64_t version_;
  std::string repository_path_;
  bool allow_optional_;

  common::TritonJson::Value model_config_;
  int max_batch_size_;
  bool enable_pinned_input_;
  bool enable_pinned_output_;

  std::vector<BatchInput> batch_inputs_;
  std::vector<BatchOutput> batch_outputs_;
  std::map<std::string, const BatchOutput*> batch_output_map_;
  std::set<std::string> ragged_inputs_;
  std::set<std::string> optional_inputs_;
};
//
// BackendModelException
//
// Exception thrown if an error occurs while constructing a
// BackendModel.
//
struct BackendModelException {
  BackendModelException(TRITONSERVER_Error* err) : err_(err) {}
  TRITONSERVER_Error* err_;
};
#define THROW_IF_BACKEND_MODEL_ERROR(X) \
do { \
TRITONSERVER_Error* tie_err__ = (X); \
if (tie_err__ != nullptr) { \
throw triton::backend::BackendModelException(tie_err__); \
} \
} while (false)
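// Illustrative usage sketch (an assumption, not part of the original header):
// backends typically derive a model state from BackendModel and create it in
// TRITONBACKEND_ModelInitialize; 'ModelState' is a hypothetical name, e.g.
//
//   class ModelState : public BackendModel {
//    public:
//     ModelState(TRITONBACKEND_Model* model) : BackendModel(model) {}
//   };
//
//   extern "C" TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(
//       TRITONBACKEND_Model* model)
//   {
//     ModelState* state = nullptr;
//     try {
//       state = new ModelState(model);
//     }
//     catch (const BackendModelException& ex) {
//       return ex.err_;
//     }
//     RETURN_IF_ERROR(
//         TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(state)));
//     return nullptr;  // success
//   }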
}}  // namespace triton::backend
3rdparty/backend-r22.12/include/triton/backend/backend_model_instance.h
0 → 100644
// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "triton/core/tritonbackend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace backend {
#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
#endif // !TRITON_ENABLE_GPU
class BackendModel;
//
// BackendModelInstance
//
// Common functionality for a backend model instance. This class is
// provided as a convenience; backends are not required to use this
// class.
//
class BackendModelInstance {
 public:
  BackendModelInstance(
      BackendModel* backend_model,
      TRITONBACKEND_ModelInstance* triton_model_instance);
  virtual ~BackendModelInstance();
// Get the name, kind and device ID of the instance.
  const std::string& Name() const { return name_; }
  TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }
  int32_t DeviceId() const { return device_id_; }
// Get the handle to the TRITONBACKEND model instance.
  TRITONBACKEND_ModelInstance* TritonModelInstance()
  {
    return triton_model_instance_;
  }
// Get the BackendModel representing the model that corresponds to
// this instance.
  BackendModel* Model() const { return backend_model_; }
// The model configuration 'default_model_filename' value, or the
// value in model configuration 'cc_model_filenames' for the GPU
// targeted by this instance. If neither are specified in the model
  // configuration, the empty string is returned.
  const std::string& ArtifactFilename() const { return artifact_filename_; }
// Returns the stream associated with this instance that can be used
// for GPU<->CPU memory transfers. Returns nullptr if GPU support is
// disabled or if this instance is not executing on a GPU.
  cudaStream_t CudaStream() { return stream_; }

  const std::string& HostPolicyName() const { return host_policy_name_; }
 protected:
  BackendModel* backend_model_;
  TRITONBACKEND_ModelInstance* triton_model_instance_;
  std::string name_;
  TRITONSERVER_InstanceGroupKind kind_;
  int32_t device_id_;
  std::string artifact_filename_;
  cudaStream_t stream_;
  std::string host_policy_name_;
};
//
// BackendModelInstanceException
//
// Exception thrown if an error occurs while constructing a
// BackendModelInstance.
//
struct BackendModelInstanceException {
  BackendModelInstanceException(TRITONSERVER_Error* err) : err_(err) {}
  TRITONSERVER_Error* err_;
};
#define THROW_IF_BACKEND_INSTANCE_ERROR(X) \
do { \
TRITONSERVER_Error* tie_err__ = (X); \
if (tie_err__ != nullptr) { \
throw triton::backend::BackendModelInstanceException(tie_err__); \
} \
} while (false)
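// Illustrative usage sketch (an assumption, not part of the original header):
// an instance state derived from BackendModelInstance is typically created in
// TRITONBACKEND_ModelInstanceInitialize, and its CudaStream() is reused for
// input/output copies; 'ModelInstanceState' is a hypothetical name, e.g.
//
//   class ModelInstanceState : public BackendModelInstance {
//    public:
//     ModelInstanceState(
//         BackendModel* model_state, TRITONBACKEND_ModelInstance* instance)
//         : BackendModelInstance(model_state, instance)
//     {
//     }
//   };
//
//   // In TRITONBACKEND_ModelInstanceInitialize, construction is guarded with
//   // THROW_IF_BACKEND_INSTANCE_ERROR / BackendModelInstanceException in the
//   // same way as the model-level equivalents above.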
}}  // namespace triton::backend