OpenDAS / Lmdeploy / Commits

Commit b30f3cdb, authored Nov 14, 2023 by xiabo
Add downloaded code
parent e38ee081
Changes: 157

Showing 17 changed files with 5026 additions and 0 deletions (+5026 −0):

3rdparty/core-r22.12/src/model_repository_manager.h (+345 −0)
3rdparty/core-r22.12/src/numa_utils.cc (+237 −0)
3rdparty/core-r22.12/src/numa_utils.h (+57 −0)
3rdparty/core-r22.12/src/payload.cc (+215 −0)
3rdparty/core-r22.12/src/payload.h (+102 −0)
3rdparty/core-r22.12/src/pinned_memory_manager.cc (+378 −0)
3rdparty/core-r22.12/src/pinned_memory_manager.h (+108 −0)
3rdparty/core-r22.12/src/rate_limiter.cc (+943 −0)
3rdparty/core-r22.12/src/rate_limiter.h (+310 −0)
3rdparty/core-r22.12/src/repo_agent.cc (+573 −0)
3rdparty/core-r22.12/src/repo_agent.h (+182 −0)
3rdparty/core-r22.12/src/response_allocator.h (+77 −0)
3rdparty/core-r22.12/src/response_cache.cc (+542 −0)
3rdparty/core-r22.12/src/response_cache.h (+198 −0)
3rdparty/core-r22.12/src/scheduler.h (+80 −0)
3rdparty/core-r22.12/src/scheduler_utils.cc (+423 −0)
3rdparty/core-r22.12/src/scheduler_utils.h (+256 −0)
3rdparty/core-r22.12/src/model_repository_manager.h (new file, mode 100644)
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once

#include <functional>
#include <map>
#include <mutex>
#include <set>

#include "infer_parameter.h"
#include "model_config.pb.h"
#include "model_lifecycle.h"
#include "status.h"
#include "triton/common/model_config.h"

namespace triton { namespace core {

class InferenceServer;
class Model;

// [FIXME] should have separated load / unload functions for clarity
enum ActionType { NO_ACTION, LOAD, UNLOAD };

/// Predefined reason strings
#define MODEL_READY_REASON_DUPLICATE "model appears in two or more repositories"

/// An object to manage the model repository active in the server.
class ModelRepositoryManager {
 public:
  // Index information for a model.
  struct ModelIndex {
    ModelIndex(const std::string& n)
        : name_only_(true), name_(n), version_(-1),
          state_(ModelReadyState::UNKNOWN)
    {
    }
    ModelIndex(
        const std::string& n, const int64_t v, const ModelReadyState s,
        const std::string& r)
        : name_only_(false), name_(n), version_(v), state_(s), reason_(r)
    {
    }
    const bool name_only_;
    const std::string name_;
    const int64_t version_;
    const ModelReadyState state_;
    const std::string reason_;
  };

  /// A basic unit in the dependency graph that records the models seen by
  /// the model repository manager.
  struct DependencyNode {
    DependencyNode(const std::string& model_name)
        : model_name_(model_name), status_(Status::Success), checked_(false)
    {
    }
    std::string model_name_;
    Status status_;
    bool checked_;
    bool explicitly_load_;
    inference::ModelConfig model_config_;
    std::set<int64_t> loaded_versions_;
    std::set<DependencyNode*> missing_upstreams_;
    std::unordered_map<DependencyNode*, std::set<int64_t>> upstreams_;
    std::set<DependencyNode*> downstreams_;
  };

  ~ModelRepositoryManager();

  /// Create a manager for a repository.
  /// \param server The pointer to the inference server.
  /// \param server_version The version of the inference server.
  /// \param repository_paths A set of file-system paths of the repositories.
  /// \param startup_models A set of models to be loaded at startup
  /// if model control is enabled.
  /// \param strict_model_config If false attempt to autofill missing required
  /// information in each model configuration.
  /// \param polling_enabled If true, then PollAndUpdate() is allowed.
  /// Otherwise, it is not allowed.
  /// \param model_control_enabled If true, then LoadUnloadModel() is allowed
  /// and the models in the model repository will not be loaded at startup.
  /// Otherwise, LoadUnloadModel() is not allowed and the models will be
  /// loaded. Cannot be set to true if polling_enabled is true.
  /// \param life_cycle_options The options to configure ModelLifeCycle.
  /// \param model_repository_manager Return the model repository manager.
  /// \return The error status.
  static Status Create(
      InferenceServer* server, const std::string& server_version,
      const std::set<std::string>& repository_paths,
      const std::set<std::string>& startup_models,
      const bool strict_model_config, const bool polling_enabled,
      const bool model_control_enabled,
      const ModelLifeCycleOptions& life_cycle_options,
      std::unique_ptr<ModelRepositoryManager>* model_repository_manager);

  /// Poll the model repository to determine the new set of models and
  /// compare with the current set. Serve the new set of models based
  /// on their version policy.
  Status PollAndUpdate();

  /// Load or unload a specified model.
  /// \param models The models and the parameters to be loaded or unloaded.
  /// \param type The type of action to be performed. If the action is LOAD
  /// and the model has been loaded, the model will be re-loaded.
  /// \return error status. Return "NOT_FOUND" if it tries to load
  /// a non-existing model or if it tries to unload a model that hasn't been
  /// loaded.
  Status LoadUnloadModel(
      const std::unordered_map<
          std::string, std::vector<const InferenceParameter*>>& models,
      const ActionType type, const bool unload_dependents);

  /// Unload all models. This function should be called before shutting down
  /// the model repository manager.
  /// \return error status.
  Status UnloadAllModels();

  /// Instruct all models to stop accepting new inference requests. However,
  /// the models are still capable of processing inference requests
  /// if the model considers them as part of the in-flight inference.
  /// \return error status.
  Status StopAllModels();

  /// \return the number of in-flight inferences for all versions of all
  /// models. The set element will be a tuple of <model_name, model_version,
  /// in-flight inference count>. Note that a model version will not be
  /// included if it doesn't have in-flight inferences.
  const std::set<std::tuple<std::string, int64_t, size_t>> InflightStatus();

  /// \param strict_readiness If true, only models that have at least one
  /// ready version will be considered as live. Otherwise, the models that
  /// have loading / unloading versions will also be live.
  /// \return the state of all versions of all live models.
  const ModelStateMap LiveModelStates(bool strict_readiness = false);

  /// \return the state of all versions of all models that have ever
  /// been (attempted) loaded over the lifetime of the server.
  const ModelStateMap ModelStates();

  /// \return the states of all versions of a specific model.
  const VersionStateMap VersionStates(const std::string& model_name);

  /// \return the ready-state of a specific model version.
  Status ModelState(
      const std::string& model_name, const int64_t model_version,
      ModelReadyState* state);

  /// Get the index of all models in all repositories.
  /// \param ready_only If true return only the index of models that are
  /// ready.
  /// \param index Returns the index.
  /// \return error status.
  Status RepositoryIndex(const bool ready_only, std::vector<ModelIndex>* index);

  /// Obtain the specified model.
  /// \param model_name The name of the model.
  /// \param model_version The version of the model.
  /// \param model Return the model object.
  /// \return error status.
  Status GetModel(
      const std::string& model_name, const int64_t model_version,
      std::shared_ptr<Model>* model);

  /// Register model repository path.
  /// \param repository Path to model repository.
  /// \param model_mapping Mapping with (overridden) model name as key, subdir
  /// name as value.
  /// \return error status
  Status RegisterModelRepository(
      const std::string& repository,
      const std::unordered_map<std::string, std::string>& model_mapping);

  /// Unregister model repository path.
  /// \param repository Path to model repository.
  /// \return error status
  Status UnregisterModelRepository(const std::string& repository);

 private:
  struct ModelInfo;

  // Map from model name to information about the model.
  using ModelInfoMap =
      std::unordered_map<std::string, std::unique_ptr<ModelInfo>>;

  // Set of DependencyNode
  using NodeSet = std::set<DependencyNode*>;

  ModelRepositoryManager(
      const std::set<std::string>& repository_paths, const bool autofill,
      const bool polling_enabled, const bool model_control_enabled,
      const double min_compute_capability,
      std::unique_ptr<ModelLifeCycle> life_cycle);

  /// The internal function that is called in Create() and PollAndUpdate().
  Status PollAndUpdateInternal(bool* all_models_polled);

  /// The internal function that loads or unloads a set of models.
  Status LoadUnloadModels(
      const std::unordered_map<
          std::string, std::vector<const InferenceParameter*>>& models,
      const ActionType type, const bool unload_dependents,
      bool* all_models_polled);

  /// Poll the requested models in the model repository and
  /// compare with the current set. Return the additions, deletions,
  /// and modifications that have occurred. This function will not update
  /// the current model info; it is the caller's responsibility to do so.
  /// \param models The map from models to be polled to their associated
  /// parameters.
  /// \param added The names of the models added to the repository.
  /// \param deleted The names of the models removed from the repository.
  /// \param modified The names of the models remaining in the
  /// repository that have been changed.
  /// \param unmodified The names of the models remaining in the
  /// repository that have not changed.
  /// \param updated_infos The model infos retrieved from the poll.
  /// \param all_models_polled Return true if all models are polled and
  /// their model configurations are validated successfully. Instead of
  /// aborting the polling, the models that fail will be ignored and their
  /// model infos will stay in the previous state.
  /// \return The error status.
  Status Poll(
      const std::unordered_map<
          std::string, std::vector<const InferenceParameter*>>& models,
      std::set<std::string>* added, std::set<std::string>* deleted,
      std::set<std::string>* modified, std::set<std::string>* unmodified,
      ModelInfoMap* updated_infos, bool* all_models_polled);

  /// Helper function for Poll() to initialize ModelInfo for the model.
  /// \param name The name of the model.
  /// \param path The model path. Empty path means the model is provided via
  /// 'params'.
  /// \param params The model parameters provided for polling the model.
  /// \param info Return the updated ModelInfo. 'nullptr' will be returned if
  /// the existing ModelInfo for the model should be reused.
  /// \return The error status.
  Status InitializeModelInfo(
      const std::string& name, const std::string& path,
      const std::vector<const InferenceParameter*>& params,
      std::unique_ptr<ModelInfo>* info);

  /// Load models based on the dependency graph. The function will iteratively
  /// load models once all the models they depend on have been loaded, and
  /// unload models if their dependencies are no longer satisfied.
  /// \return The status of the model loads.
  std::map<std::string, Status> LoadModelByDependency();

  /// Helper function to update the dependency graph based on the poll result.
  /// \param added The names of the models added to the repository.
  /// \param deleted The names of the models removed from the repository.
  /// \param modified The names of the models remaining in the
  /// repository that have been changed.
  /// \param deleted_dependents The names of dependent models to be removed
  /// from the repository.
  /// \return The error status.
  Status UpdateDependencyGraph(
      const std::set<std::string>& added, const std::set<std::string>& deleted,
      const std::set<std::string>& modified,
      std::set<std::string>* deleted_dependents = nullptr);

  /// Helper function to uncheck the nodes because the model that they depend
  /// on has changed. The unchecked nodes will be validated again.
  /// The function will be called recursively to uncheck all downstreams.
  /// \param downstreams The nodes to be unchecked.
  /// \param updated_nodes Return the nodes that have been unchecked.
  void UncheckDownstream(NodeSet* downstreams, NodeSet* updated_nodes);

  /// Helper function to construct the edges between nodes in the dependency
  /// graph.
  /// \param updated_node The node that is newly added or modified.
  /// \return True if the node represents an ensemble model. False otherwise.
  bool ConnectDependencyGraph(DependencyNode* updated_node);

  /// Get the model info for a named model.
  /// \param name The model name.
  /// \param model_info Returns the model information.
  /// \return OK if found, NOT_FOUND otherwise.
  Status GetModelInfo(const std::string& name, ModelInfo** model_info);

  /// Get the models to be loaded / unloaded based on the models loaded in
  /// the previous iteration.
  /// \param loaded_models The models loaded / unloaded in the previous
  /// iteration. Unloaded models will be represented as models with no loaded
  /// versions.
  /// \return A pair of node sets containing models to be loaded and models
  /// to be unloaded for the next iteration.
  std::pair<NodeSet, NodeSet> ModelsToLoadUnload(const NodeSet& loaded_models);

  /// Check if the node is ready for the next iteration. A node is ready if
  /// the node is invalid (containing an invalid model config or its
  /// dependencies failed to load) or if all of its dependencies are
  /// satisfied.
  /// \param node The node to be checked.
  /// \return True if the node is ready. False otherwise.
  bool CheckNode(DependencyNode* node);

  Status CircularcyCheck(
      DependencyNode* current_node, const DependencyNode* start_node);

  bool ModelDirectoryOverride(
      const std::vector<const InferenceParameter*>& model_params);

  std::set<std::string> repository_paths_;
  const bool autofill_;
  const bool polling_enabled_;
  const bool model_control_enabled_;
  const double min_compute_capability_;

  std::mutex poll_mu_;
  ModelInfoMap infos_;

  std::unordered_map<std::string, std::unique_ptr<DependencyNode>>
      dependency_graph_;
  std::unordered_map<std::string, std::unique_ptr<DependencyNode>>
      missing_nodes_;

  // Mappings from (overridden) model names to a pair of their repository and
  // absolute path
  std::unordered_map<std::string, std::pair<std::string, std::string>>
      model_mappings_;

  std::unique_ptr<ModelLifeCycle> model_life_cycle_;
};

}}  // namespace triton::core
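Editor's note: the doc comments on LoadModelByDependency() above describe an iterative scheme: on each pass, load every model whose upstream dependencies are already loaded, and stop when a pass makes no progress. The following self-contained sketch illustrates that fixpoint pattern with toy types; the names (ToyNode, LoadAll) are hypothetical stand-ins, not the Triton API.

// Toy illustration of the iterative load-by-dependency pattern documented
// above. Compile with -std=c++17. ToyNode/LoadAll are hypothetical.
#include <iostream>
#include <map>
#include <set>
#include <string>

struct ToyNode {
  std::set<std::string> upstreams;  // models this model depends on
  bool loaded = false;
};

// Repeatedly load every node whose upstreams are all loaded; stop when a
// full pass makes no progress (remaining nodes have unsatisfiable deps).
void LoadAll(std::map<std::string, ToyNode>& graph)
{
  bool progressed = true;
  while (progressed) {
    progressed = false;
    for (auto& [name, node] : graph) {
      if (node.loaded) continue;
      bool ready = true;
      for (const auto& up : node.upstreams) {
        auto it = graph.find(up);
        ready = ready && (it != graph.end()) && it->second.loaded;
      }
      if (ready) {
        node.loaded = true;  // stands in for the real model load
        progressed = true;
        std::cout << "loaded " << name << "\n";
      }
    }
  }
}

int main()
{
  std::map<std::string, ToyNode> graph;
  graph["preprocess"] = {};
  graph["classifier"] = {{"preprocess"}, false};
  graph["ensemble"] = {{"preprocess", "classifier"}, false};
  LoadAll(graph);  // loads preprocess, then classifier, then ensemble
}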
3rdparty/core-r22.12/src/numa_utils.cc (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "numa_utils.h"
#ifndef _WIN32
#include <numa.h>
#include <numaif.h>
#endif
#include "triton/common/logging.h"
namespace
triton
{
namespace
core
{
namespace
{
std
::
string
VectorToString
(
const
std
::
vector
<
int
>&
vec
)
{
std
::
string
str
(
"["
);
for
(
const
auto
&
element
:
vec
)
{
str
+=
std
::
to_string
(
element
);
str
+=
","
;
}
str
+=
"]"
;
return
str
;
}
Status
ParseIntOption
(
const
std
::
string
&
msg
,
const
std
::
string
&
arg
,
int
*
value
)
{
try
{
*
value
=
std
::
stoi
(
arg
);
}
catch
(
const
std
::
invalid_argument
&
ia
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
msg
+
": Can't parse '"
+
arg
+
"' to integer"
);
}
return
Status
::
Success
;
}
}
// namespace
// NUMA setting will be ignored on Windows platform
#ifdef _WIN32
Status
SetNumaConfigOnThread
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
return
Status
::
Success
;
}
Status
SetNumaMemoryPolicy
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
return
Status
::
Success
;
}
Status
GetNumaMemoryPolicyNodeMask
(
unsigned
long
*
node_mask
)
{
*
node_mask
=
0
;
return
Status
::
Success
;
}
Status
ResetNumaMemoryPolicy
()
{
return
Status
::
Success
;
}
Status
SetNumaThreadAffinity
(
std
::
thread
::
native_handle_type
thread
,
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
return
Status
::
Success
;
}
#else
// Use variable to make sure no NUMA related function is actually called
// if Triton is not running with NUMA awareness. i.e. Extra docker permission
// is needed to call the NUMA functions and this ensures backward compatibility.
thread_local
bool
numa_set
=
false
;
Status
SetNumaConfigOnThread
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
// Set thread affinity
RETURN_IF_ERROR
(
SetNumaThreadAffinity
(
pthread_self
(),
host_policy
));
// Set memory policy
RETURN_IF_ERROR
(
SetNumaMemoryPolicy
(
host_policy
));
return
Status
::
Success
;
}
Status
SetNumaMemoryPolicy
(
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
const
auto
it
=
host_policy
.
find
(
"numa-node"
);
if
(
it
!=
host_policy
.
end
())
{
int
node_id
;
RETURN_IF_ERROR
(
ParseIntOption
(
"Parsing 'numa-node' value"
,
it
->
second
,
&
node_id
));
LOG_VERBOSE
(
1
)
<<
"Thread is binding to NUMA node "
<<
it
->
second
<<
". Max NUMA node count: "
<<
(
numa_max_node
()
+
1
);
numa_set
=
true
;
unsigned
long
node_mask
=
1UL
<<
node_id
;
if
(
set_mempolicy
(
MPOL_BIND
,
&
node_mask
,
(
numa_max_node
()
+
1
)
+
1
)
!=
0
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to set NUMA memory policy: "
)
+
strerror
(
errno
));
}
}
return
Status
::
Success
;
}
Status
GetNumaMemoryPolicyNodeMask
(
unsigned
long
*
node_mask
)
{
*
node_mask
=
0
;
int
mode
;
if
(
numa_set
&&
get_mempolicy
(
&
mode
,
node_mask
,
numa_max_node
()
+
1
,
NULL
,
0
)
!=
0
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to get NUMA node for current thread: "
)
+
strerror
(
errno
));
}
return
Status
::
Success
;
}
Status
ResetNumaMemoryPolicy
()
{
if
(
numa_set
&&
(
set_mempolicy
(
MPOL_DEFAULT
,
nullptr
,
0
)
!=
0
))
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to reset NUMA memory policy: "
)
+
strerror
(
errno
));
}
numa_set
=
false
;
return
Status
::
Success
;
}
Status
SetNumaThreadAffinity
(
std
::
thread
::
native_handle_type
thread
,
const
triton
::
common
::
HostPolicyCmdlineConfig
&
host_policy
)
{
const
auto
it
=
host_policy
.
find
(
"cpu-cores"
);
if
(
it
!=
host_policy
.
end
())
{
// Parse CPUs
std
::
vector
<
int
>
cpus
;
{
const
auto
&
cpu_str
=
it
->
second
;
auto
delim_cpus
=
cpu_str
.
find
(
","
);
int
current_pos
=
0
;
while
(
true
)
{
auto
delim_range
=
cpu_str
.
find
(
"-"
,
current_pos
);
if
(
delim_range
==
std
::
string
::
npos
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
std
::
string
(
"host policy setting 'cpu-cores' format is "
"'<lower_cpu_core_id>-<upper_cpu_core_id>'. Got "
)
+
cpu_str
.
substr
(
current_pos
,
((
delim_cpus
==
std
::
string
::
npos
)
?
(
cpu_str
.
length
()
+
1
)
:
delim_cpus
)
-
current_pos
));
}
int
lower
,
upper
;
RETURN_IF_ERROR
(
ParseIntOption
(
"Parsing 'cpu-cores' value"
,
cpu_str
.
substr
(
current_pos
,
delim_range
-
current_pos
),
&
lower
));
RETURN_IF_ERROR
(
ParseIntOption
(
"Parsing 'cpu-cores' value"
,
(
delim_cpus
==
std
::
string
::
npos
)
?
cpu_str
.
substr
(
delim_range
+
1
)
:
cpu_str
.
substr
(
delim_range
+
1
,
delim_cpus
-
(
delim_range
+
1
)),
&
upper
));
for
(;
lower
<=
upper
;
++
lower
)
{
cpus
.
push_back
(
lower
);
}
// break if the processed range is the last specified range
if
(
delim_cpus
!=
std
::
string
::
npos
)
{
current_pos
=
delim_cpus
+
1
;
delim_cpus
=
cpu_str
.
find
(
","
,
current_pos
);
}
else
{
break
;
}
}
}
LOG_VERBOSE
(
1
)
<<
"Thread is binding to one of the CPUs: "
<<
VectorToString
(
cpus
);
numa_set
=
true
;
cpu_set_t
cpuset
;
CPU_ZERO
(
&
cpuset
);
for
(
int
cpu
:
cpus
)
{
CPU_SET
(
cpu
,
&
cpuset
);
}
if
(
pthread_setaffinity_np
(
thread
,
sizeof
(
cpu_set_t
),
&
cpuset
)
!=
0
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
std
::
string
(
"Unable to set NUMA thread affinity: "
)
+
strerror
(
errno
));
}
}
return
Status
::
Success
;
}
#endif
}}
// namespace triton::core
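Editor's note: SetNumaThreadAffinity() above parses the 'cpu-cores' host-policy string as comma-separated '<lower>-<upper>' ranges. Here is a self-contained sketch of just that parsing step, with the in-repo Status/RETURN_IF_ERROR plumbing replaced by plain exceptions so it compiles on its own; the loop structure is slightly simplified but produces the same CPU list.

// Standalone sketch of the 'cpu-cores' range parsing done in
// SetNumaThreadAffinity(). Error handling is simplified to exceptions;
// the real code returns triton::core::Status instead.
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int> ParseCpuCores(const std::string& cpu_str)
{
  std::vector<int> cpus;
  size_t current_pos = 0;
  while (true) {
    auto delim_range = cpu_str.find('-', current_pos);
    if (delim_range == std::string::npos) {
      throw std::invalid_argument(
          "'cpu-cores' format is '<lower_cpu_core_id>-<upper_cpu_core_id>'");
    }
    auto delim_cpus = cpu_str.find(',', current_pos);
    int lower =
        std::stoi(cpu_str.substr(current_pos, delim_range - current_pos));
    int upper = std::stoi(
        (delim_cpus == std::string::npos)
            ? cpu_str.substr(delim_range + 1)
            : cpu_str.substr(delim_range + 1, delim_cpus - (delim_range + 1)));
    for (; lower <= upper; ++lower) {
      cpus.push_back(lower);  // expand the inclusive range
    }
    if (delim_cpus == std::string::npos) {
      break;  // the processed range was the last one
    }
    current_pos = delim_cpus + 1;
  }
  return cpus;
}

int main()
{
  for (int cpu : ParseCpuCores("0-2,6-7")) {
    std::cout << cpu << " ";  // prints: 0 1 2 6 7
  }
  std::cout << "\n";
}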
3rdparty/core-r22.12/src/numa_utils.h (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <map>
#include <thread>
#include <vector>

#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"

namespace triton { namespace core {

// Helper function to set memory policy and thread affinity on current thread
Status SetNumaConfigOnThread(
    const triton::common::HostPolicyCmdlineConfig& host_policy);

// Restrict the memory allocation to specific NUMA node.
Status SetNumaMemoryPolicy(
    const triton::common::HostPolicyCmdlineConfig& host_policy);

// Retrieve the node mask used to set memory policy for the current thread
Status GetNumaMemoryPolicyNodeMask(unsigned long* node_mask);

// Reset the memory allocation setting.
Status ResetNumaMemoryPolicy();

// Set a thread affinity to be on specific cpus.
Status SetNumaThreadAffinity(
    std::thread::native_handle_type thread,
    const triton::common::HostPolicyCmdlineConfig& host_policy);

}}  // namespace triton::core
3rdparty/core-r22.12/src/payload.cc (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "payload.h"
namespace
triton
{
namespace
core
{
Payload
::
Payload
()
:
op_type_
(
Operation
::
INFER_RUN
),
requests_
(
std
::
vector
<
std
::
unique_ptr
<
InferenceRequest
>>
()),
OnCallback_
([]()
{}),
instance_
(
nullptr
),
state_
(
State
::
UNINITIALIZED
),
batcher_start_ns_
(
0
),
saturated_
(
false
)
{
exec_mu_
.
reset
(
new
std
::
mutex
());
}
const
Status
&
Payload
::
MergePayload
(
std
::
shared_ptr
<
Payload
>&
payload
)
{
if
((
payload
->
GetOpType
()
!=
Operation
::
INFER_RUN
)
||
(
op_type_
!=
Operation
::
INFER_RUN
))
{
static
Status
op_type_error
(
Status
::
Code
::
INTERNAL
,
"Attempted to merge payloads of type that are not INFER_RUN"
);
return
op_type_error
;
}
if
(
payload
->
GetInstance
()
!=
instance_
)
{
static
Status
instance_error
(
Status
::
Code
::
INTERNAL
,
"Attempted to merge payloads of mismatching instance"
);
return
instance_error
;
}
if
((
payload
->
GetState
()
!=
State
::
EXECUTING
)
||
(
state_
!=
State
::
EXECUTING
))
{
static
Status
state_error
(
Status
::
Code
::
INTERNAL
,
"Attempted to merge payloads that are not in executing state"
);
return
state_error
;
}
// Skip comparison if not initialized (required), here assume either all
// payloads are initialized or otherwise.
if
(
required_equal_inputs_
.
Initialized
()
&&
!
required_equal_inputs_
.
HasEqualInputs
(
*
payload
->
Requests
().
begin
()))
{
static
Status
shape_error
(
Status
::
Code
::
INVALID_ARG
,
"Attempted to merge payloads that has non-equal inputs"
);
return
shape_error
;
}
requests_
.
insert
(
requests_
.
end
(),
std
::
make_move_iterator
(
payload
->
Requests
().
begin
()),
std
::
make_move_iterator
(
payload
->
Requests
().
end
()));
payload
->
Callback
();
return
Status
::
Success
;
}
void
Payload
::
Reset
(
const
Operation
op_type
,
TritonModelInstance
*
instance
)
{
op_type_
=
op_type
;
requests_
.
clear
();
OnCallback_
=
[]()
{};
release_callbacks_
.
clear
();
instance_
=
instance
;
state_
=
State
::
UNINITIALIZED
;
status_
.
reset
(
new
std
::
promise
<
Status
>
());
required_equal_inputs_
=
RequiredEqualInputs
();
batcher_start_ns_
=
0
;
saturated_
=
false
;
}
void
Payload
::
Release
()
{
op_type_
=
Operation
::
INFER_RUN
;
requests_
.
clear
();
OnCallback_
=
[]()
{};
release_callbacks_
.
clear
();
instance_
=
nullptr
;
state_
=
State
::
RELEASED
;
required_equal_inputs_
=
RequiredEqualInputs
();
batcher_start_ns_
=
0
;
saturated_
=
false
;
}
size_t
Payload
::
BatchSize
()
{
size_t
batch_size
=
0
;
for
(
const
auto
&
request
:
requests_
)
{
batch_size
+=
std
::
max
(
1U
,
request
->
BatchSize
());
}
return
batch_size
;
}
void
Payload
::
ReserveRequests
(
size_t
size
)
{
requests_
.
reserve
(
size
);
}
void
Payload
::
AddRequest
(
std
::
unique_ptr
<
InferenceRequest
>
request
)
{
if
((
batcher_start_ns_
==
0
)
||
(
batcher_start_ns_
>
request
->
BatcherStartNs
()))
{
batcher_start_ns_
=
request
->
BatcherStartNs
();
}
requests_
.
push_back
(
std
::
move
(
request
));
}
void
Payload
::
SetCallback
(
std
::
function
<
void
()
>
OnCallback
)
{
OnCallback_
=
OnCallback
;
}
void
Payload
::
SetInstance
(
TritonModelInstance
*
model_instance
)
{
instance_
=
model_instance
;
}
void
Payload
::
AddInternalReleaseCallback
(
std
::
function
<
void
()
>&&
callback
)
{
release_callbacks_
.
emplace_back
(
std
::
move
(
callback
));
}
void
Payload
::
MarkSaturated
()
{
saturated_
=
true
;
}
void
Payload
::
SetState
(
Payload
::
State
state
)
{
state_
=
state
;
}
Status
Payload
::
Wait
()
{
return
status_
->
get_future
().
get
();
}
void
Payload
::
Callback
()
{
OnCallback_
();
}
void
Payload
::
OnRelease
()
{
// Invoke the release callbacks added internally before releasing the
// request to user provided callback.
for
(
auto
it
=
release_callbacks_
.
rbegin
();
it
!=
release_callbacks_
.
rend
();
it
++
)
{
(
*
it
)();
}
release_callbacks_
.
clear
();
}
void
Payload
::
Execute
(
bool
*
should_exit
)
{
*
should_exit
=
false
;
Status
status
;
switch
(
op_type_
)
{
case
Operation
::
INFER_RUN
:
instance_
->
Schedule
(
std
::
move
(
requests_
),
OnCallback_
);
break
;
case
Operation
::
INIT
:
status
=
instance_
->
Initialize
();
break
;
case
Operation
::
WARM_UP
:
status
=
instance_
->
WarmUp
();
break
;
case
Operation
::
EXIT
:
*
should_exit
=
true
;
}
status_
->
set_value
(
status
);
}
}}
// namespace triton::core
3rdparty/core-r22.12/src/payload.h (new file, mode 100644)
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <vector>

#include "backend_model_instance.h"
#include "infer_request.h"
#include "scheduler_utils.h"
#include "status.h"

namespace triton { namespace core {

class Payload {
 public:
  enum Operation { INFER_RUN = 0, INIT = 1, WARM_UP = 2, EXIT = 3 };
  enum State {
    UNINITIALIZED = 0,
    READY = 1,
    REQUESTED = 2,
    SCHEDULED = 3,
    EXECUTING = 4,
    RELEASED = 5
  };

  Payload();
  void Reset(const Operation op_type, TritonModelInstance* instance = nullptr);
  const Status& MergePayload(std::shared_ptr<Payload>& payload);

  Operation GetOpType() { return op_type_; }
  std::mutex* GetExecMutex() { return exec_mu_.get(); }
  size_t RequestCount() { return requests_.size(); }
  size_t BatchSize();
  void ReserveRequests(size_t size);
  void AddRequest(std::unique_ptr<InferenceRequest> request);
  std::vector<std::unique_ptr<InferenceRequest>>& Requests()
  {
    return requests_;
  }
  uint64_t BatcherStartNs() { return batcher_start_ns_; }
  void SetCallback(std::function<void()> OnCallback);
  void Callback();
  void AddInternalReleaseCallback(std::function<void()>&& callback);
  void OnRelease();
  void SetInstance(TritonModelInstance* model_instance);
  TritonModelInstance* GetInstance() { return instance_; }
  void MarkSaturated();
  bool IsSaturated() { return saturated_; }
  RequiredEqualInputs* MutableRequiredEqualInputs()
  {
    return &required_equal_inputs_;
  }
  State GetState() { return state_; }
  void SetState(State state);
  void Execute(bool* should_exit);
  Status Wait();
  void Release();

 private:
  Operation op_type_;
  std::vector<std::unique_ptr<InferenceRequest>> requests_;
  std::function<void()> OnCallback_;
  std::vector<std::function<void()>> release_callbacks_;
  TritonModelInstance* instance_;
  State state_;
  std::unique_ptr<std::promise<Status>> status_;
  std::unique_ptr<std::mutex> exec_mu_;
  uint64_t batcher_start_ns_;
  RequiredEqualInputs required_equal_inputs_;
  bool saturated_;
};

}}  // namespace triton::core
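Editor's note: Payload::Wait() blocks on a std::promise<Status> that Payload::Execute() fulfills, which is how callers synchronize with work done on a scheduler thread. A minimal self-contained sketch of that promise/future handshake follows; 'Job' is a toy stand-in (with int in place of Status), not the Payload class itself.

// Minimal sketch of the promise/future handshake used by Payload::Wait()
// and Payload::Execute(): the worker fulfills the promise, the caller
// blocks on the matching future.
#include <future>
#include <iostream>
#include <memory>
#include <thread>

struct Job {
  std::unique_ptr<std::promise<int>> status_ =
      std::make_unique<std::promise<int>>();

  // Caller side: mirrors Payload::Wait().
  int Wait() { return status_->get_future().get(); }

  // Worker side: mirrors Payload::Execute() calling status_->set_value().
  void Execute() { status_->set_value(0 /* stand-in for Status::Success */); }
};

int main()
{
  Job job;
  std::thread worker([&job]() { job.Execute(); });
  std::cout << "job finished with status " << job.Wait() << "\n";
  worker.join();
}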
3rdparty/core-r22.12/src/pinned_memory_manager.cc (new file, mode 100644)
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "pinned_memory_manager.h"
#include <sstream>
#include "numa_utils.h"
#include "triton/common/logging.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace
triton
{
namespace
core
{
namespace
{
std
::
string
PointerToString
(
void
*
ptr
)
{
std
::
stringstream
ss
;
ss
<<
ptr
;
return
ss
.
str
();
}
Status
ParseIntOption
(
const
std
::
string
&
msg
,
const
std
::
string
&
arg
,
int
*
value
)
{
try
{
*
value
=
std
::
stoi
(
arg
);
}
catch
(
const
std
::
invalid_argument
&
ia
)
{
return
Status
(
Status
::
Code
::
INVALID_ARG
,
msg
+
": Can't parse '"
+
arg
+
"' to integer"
);
}
return
Status
::
Success
;
}
}
// namespace
std
::
unique_ptr
<
PinnedMemoryManager
>
PinnedMemoryManager
::
instance_
;
uint64_t
PinnedMemoryManager
::
pinned_memory_byte_size_
;
PinnedMemoryManager
::
PinnedMemory
::
PinnedMemory
(
void
*
pinned_memory_buffer
,
uint64_t
size
)
:
pinned_memory_buffer_
(
pinned_memory_buffer
)
{
if
(
pinned_memory_buffer_
!=
nullptr
)
{
managed_pinned_memory_
=
boost
::
interprocess
::
managed_external_buffer
(
boost
::
interprocess
::
create_only_t
{},
pinned_memory_buffer_
,
size
);
}
}
PinnedMemoryManager
::
PinnedMemory
::~
PinnedMemory
()
{
#ifdef TRITON_ENABLE_GPU
if
(
pinned_memory_buffer_
!=
nullptr
)
{
cudaFreeHost
(
pinned_memory_buffer_
);
}
#endif // TRITON_ENABLE_GPU
}
PinnedMemoryManager
::~
PinnedMemoryManager
()
{
// Clean up
for
(
const
auto
&
memory_info
:
memory_info_
)
{
const
auto
&
is_pinned
=
memory_info
.
second
.
first
;
if
(
!
is_pinned
)
{
free
(
memory_info
.
first
);
}
}
}
void
PinnedMemoryManager
::
AddPinnedMemoryBuffer
(
const
std
::
shared_ptr
<
PinnedMemory
>&
pinned_memory_buffer
,
unsigned
long
node_mask
)
{
pinned_memory_buffers_
[
node_mask
]
=
pinned_memory_buffer
;
}
Status
PinnedMemoryManager
::
AllocInternal
(
void
**
ptr
,
uint64_t
size
,
TRITONSERVER_MemoryType
*
allocated_type
,
bool
allow_nonpinned_fallback
,
PinnedMemory
*
pinned_memory_buffer
)
{
auto
status
=
Status
::
Success
;
if
(
pinned_memory_buffer
->
pinned_memory_buffer_
!=
nullptr
)
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
pinned_memory_buffer
->
buffer_mtx_
);
*
ptr
=
pinned_memory_buffer
->
managed_pinned_memory_
.
allocate
(
size
,
std
::
nothrow_t
{});
*
allocated_type
=
TRITONSERVER_MEMORY_CPU_PINNED
;
if
(
*
ptr
==
nullptr
)
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"failed to allocate pinned system memory"
);
}
}
else
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"failed to allocate pinned system memory: no pinned memory pool"
);
}
bool
is_pinned
=
true
;
if
((
!
status
.
IsOk
())
&&
allow_nonpinned_fallback
)
{
static
bool
warning_logged
=
false
;
if
(
!
warning_logged
)
{
LOG_WARNING
<<
status
.
Message
()
<<
", falling back to non-pinned system memory"
;
warning_logged
=
true
;
}
*
ptr
=
malloc
(
size
);
*
allocated_type
=
TRITONSERVER_MEMORY_CPU
;
is_pinned
=
false
;
if
(
*
ptr
==
nullptr
)
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"failed to allocate non-pinned system memory"
);
}
else
{
status
=
Status
::
Success
;
}
}
// keep track of allocated buffer or clean up
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
info_mtx_
);
if
(
status
.
IsOk
())
{
auto
res
=
memory_info_
.
emplace
(
*
ptr
,
std
::
make_pair
(
is_pinned
,
pinned_memory_buffer
));
if
(
!
res
.
second
)
{
status
=
Status
(
Status
::
Code
::
INTERNAL
,
"unexpected memory address collision, '"
+
PointerToString
(
*
ptr
)
+
"' has been managed"
);
}
LOG_VERBOSE
(
1
)
<<
(
is_pinned
?
""
:
"non-"
)
<<
"pinned memory allocation: "
<<
"size "
<<
size
<<
", addr "
<<
*
ptr
;
}
}
if
((
!
status
.
IsOk
())
&&
(
*
ptr
!=
nullptr
))
{
if
(
is_pinned
)
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
pinned_memory_buffer
->
buffer_mtx_
);
pinned_memory_buffer
->
managed_pinned_memory_
.
deallocate
(
*
ptr
);
}
else
{
free
(
*
ptr
);
}
}
return
status
;
}
Status
PinnedMemoryManager
::
FreeInternal
(
void
*
ptr
)
{
bool
is_pinned
=
true
;
PinnedMemory
*
pinned_memory_buffer
=
nullptr
;
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
info_mtx_
);
auto
it
=
memory_info_
.
find
(
ptr
);
if
(
it
!=
memory_info_
.
end
())
{
is_pinned
=
it
->
second
.
first
;
pinned_memory_buffer
=
it
->
second
.
second
;
LOG_VERBOSE
(
1
)
<<
(
is_pinned
?
""
:
"non-"
)
<<
"pinned memory deallocation: "
<<
"addr "
<<
ptr
;
memory_info_
.
erase
(
it
);
}
else
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"unexpected memory address '"
+
PointerToString
(
ptr
)
+
"' is not being managed"
);
}
}
if
(
is_pinned
)
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
pinned_memory_buffer
->
buffer_mtx_
);
pinned_memory_buffer
->
managed_pinned_memory_
.
deallocate
(
ptr
);
}
else
{
free
(
ptr
);
}
return
Status
::
Success
;
}
void
PinnedMemoryManager
::
Reset
()
{
instance_
.
reset
();
}
Status
PinnedMemoryManager
::
Create
(
const
Options
&
options
)
{
if
(
instance_
!=
nullptr
)
{
LOG_WARNING
<<
"New pinned memory pool of size "
<<
options
.
pinned_memory_pool_byte_size_
<<
" could not be created since one already exists"
<<
" of size "
<<
pinned_memory_byte_size_
;
return
Status
::
Success
;
}
instance_
.
reset
(
new
PinnedMemoryManager
());
if
(
options
.
host_policy_map_
.
empty
())
{
void
*
buffer
=
nullptr
;
#ifdef TRITON_ENABLE_GPU
auto
err
=
cudaHostAlloc
(
&
buffer
,
options
.
pinned_memory_pool_byte_size_
,
cudaHostAllocPortable
);
if
(
err
!=
cudaSuccess
)
{
buffer
=
nullptr
;
LOG_WARNING
<<
"Unable to allocate pinned system memory, pinned memory "
"pool will not be available: "
<<
std
::
string
(
cudaGetErrorString
(
err
));
}
else
if
(
options
.
pinned_memory_pool_byte_size_
!=
0
)
{
LOG_INFO
<<
"Pinned memory pool is created at '"
<<
PointerToString
(
buffer
)
<<
"' with size "
<<
options
.
pinned_memory_pool_byte_size_
;
}
else
{
LOG_INFO
<<
"Pinned memory pool disabled"
;
}
#endif // TRITON_ENABLE_GPU
try
{
instance_
->
AddPinnedMemoryBuffer
(
std
::
shared_ptr
<
PinnedMemory
>
(
new
PinnedMemory
(
buffer
,
options
.
pinned_memory_pool_byte_size_
)),
0
);
}
catch
(
const
std
::
exception
&
ex
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"Failed to add Pinned Memory buffer: "
+
std
::
string
(
ex
.
what
()));
}
}
else
{
// Create only one buffer / manager should be created for one node,
// and all associated devices should request memory from the shared manager
std
::
map
<
int32_t
,
std
::
string
>
numa_map
;
for
(
const
auto
host_policy
:
options
.
host_policy_map_
)
{
const
auto
numa_it
=
host_policy
.
second
.
find
(
"numa-node"
);
if
(
numa_it
!=
host_policy
.
second
.
end
())
{
int32_t
numa_id
;
if
(
ParseIntOption
(
"Parsing NUMA node"
,
numa_it
->
second
,
&
numa_id
)
.
IsOk
())
{
numa_map
.
emplace
(
numa_id
,
host_policy
.
first
);
}
}
}
for
(
const
auto
node_policy
:
numa_map
)
{
auto
status
=
SetNumaMemoryPolicy
(
options
.
host_policy_map_
.
at
(
node_policy
.
second
));
if
(
!
status
.
IsOk
())
{
LOG_WARNING
<<
"Unable to allocate pinned system memory for NUMA node "
<<
node_policy
.
first
<<
": "
<<
status
.
AsString
();
continue
;
}
unsigned
long
node_mask
;
status
=
GetNumaMemoryPolicyNodeMask
(
&
node_mask
);
if
(
!
status
.
IsOk
())
{
LOG_WARNING
<<
"Unable to get NUMA node set for current thread: "
<<
status
.
AsString
();
continue
;
}
void
*
buffer
=
nullptr
;
#ifdef TRITON_ENABLE_GPU
auto
err
=
cudaHostAlloc
(
&
buffer
,
options
.
pinned_memory_pool_byte_size_
,
cudaHostAllocPortable
);
if
(
err
!=
cudaSuccess
)
{
buffer
=
nullptr
;
LOG_WARNING
<<
"Unable to allocate pinned system memory, pinned memory "
"pool will not be available: "
<<
std
::
string
(
cudaGetErrorString
(
err
));
}
else
if
(
options
.
pinned_memory_pool_byte_size_
!=
0
)
{
LOG_INFO
<<
"Pinned memory pool is created at '"
<<
PointerToString
(
buffer
)
<<
"' with size "
<<
options
.
pinned_memory_pool_byte_size_
;
}
else
{
LOG_INFO
<<
"Pinned memory pool disabled"
;
}
#endif // TRITON_ENABLE_GPU
ResetNumaMemoryPolicy
();
try
{
instance_
->
AddPinnedMemoryBuffer
(
std
::
shared_ptr
<
PinnedMemory
>
(
new
PinnedMemory
(
buffer
,
options
.
pinned_memory_pool_byte_size_
)),
node_mask
);
}
catch
(
const
std
::
exception
&
ex
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"Failed to add Pinned Memory buffer with host policy: "
+
std
::
string
(
ex
.
what
()));
}
}
// If no pinned memory is allocated, add an empty entry where all allocation
// will be on normal system memory
if
(
instance_
->
pinned_memory_buffers_
.
empty
())
{
try
{
instance_
->
AddPinnedMemoryBuffer
(
std
::
shared_ptr
<
PinnedMemory
>
(
new
PinnedMemory
(
nullptr
,
options
.
pinned_memory_pool_byte_size_
)),
0
);
}
catch
(
const
std
::
exception
&
ex
)
{
return
Status
(
Status
::
Code
::
INTERNAL
,
"Failed to add empty Pinned Memory entry: "
+
std
::
string
(
ex
.
what
()));
}
}
}
pinned_memory_byte_size_
=
options
.
pinned_memory_pool_byte_size_
;
return
Status
::
Success
;
}
Status
PinnedMemoryManager
::
Alloc
(
void
**
ptr
,
uint64_t
size
,
TRITONSERVER_MemoryType
*
allocated_type
,
bool
allow_nonpinned_fallback
)
{
if
(
instance_
==
nullptr
)
{
return
Status
(
Status
::
Code
::
UNAVAILABLE
,
"PinnedMemoryManager has not been created"
);
}
auto
pinned_memory_buffer
=
instance_
->
pinned_memory_buffers_
.
begin
()
->
second
.
get
();
if
(
instance_
->
pinned_memory_buffers_
.
size
()
>
1
)
{
unsigned
long
node_mask
;
if
(
GetNumaMemoryPolicyNodeMask
(
&
node_mask
).
IsOk
())
{
auto
it
=
instance_
->
pinned_memory_buffers_
.
find
(
node_mask
);
if
(
it
!=
instance_
->
pinned_memory_buffers_
.
end
())
{
pinned_memory_buffer
=
it
->
second
.
get
();
}
}
}
return
instance_
->
AllocInternal
(
ptr
,
size
,
allocated_type
,
allow_nonpinned_fallback
,
pinned_memory_buffer
);
}
Status
PinnedMemoryManager
::
Free
(
void
*
ptr
)
{
if
(
instance_
==
nullptr
)
{
return
Status
(
Status
::
Code
::
UNAVAILABLE
,
"PinnedMemoryManager has not been created"
);
}
return
instance_
->
FreeInternal
(
ptr
);
}
}}
// namespace triton::core
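Editor's note: AllocInternal()/FreeInternal() above carve sub-allocations out of one big cudaHostAlloc'd region via boost::interprocess::managed_external_buffer, so pinning happens once at pool creation rather than per allocation. The following self-contained sketch shows that sub-allocation technique over a plain malloc'd region (my substitution, so the example runs without CUDA wherever Boost.Interprocess is available).

// Sketch of sub-allocating from one pre-reserved region via
// boost::interprocess::managed_external_buffer, the same technique the
// manager uses over its cudaHostAlloc'd pool. The region here is plain
// malloc'd memory so the example is CUDA-free.
#include <boost/interprocess/managed_external_buffer.hpp>
#include <cstdlib>
#include <iostream>
#include <new>

int main()
{
  const std::size_t pool_size = 1 << 20;  // 1 MiB pool, reserved once
  void* pool = std::malloc(pool_size);

  boost::interprocess::managed_external_buffer managed(
      boost::interprocess::create_only_t{}, pool, pool_size);

  // Nothrow allocate, as in AllocInternal(): nullptr means pool exhausted.
  void* a = managed.allocate(4096, std::nothrow_t{});
  void* b = managed.allocate(8192, std::nothrow_t{});
  std::cout << "a=" << a << " b=" << b << "\n";

  managed.deallocate(a);  // mirrors FreeInternal()
  managed.deallocate(b);

  std::free(pool);
}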
3rdparty/core-r22.12/src/pinned_memory_manager.h (new file, mode 100644)
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once

#include <boost/interprocess/managed_external_buffer.hpp>
#include <map>
#include <memory>
#include <mutex>

#include "status.h"
#include "triton/common/model_config.h"

namespace triton { namespace core {

// This is a singleton class responsible for maintaining the pinned memory
// pool used by the inference server. Pinned memory allocations and
// deallocations must be requested via functions provided by this class.
class PinnedMemoryManager {
 public:
  // Options to configure pinned memory manager.
  struct Options {
    Options(
        uint64_t b = 0,
        const triton::common::HostPolicyCmdlineConfigMap& host_policy_map = {})
        : pinned_memory_pool_byte_size_(b), host_policy_map_(host_policy_map)
    {
    }

    uint64_t pinned_memory_pool_byte_size_;
    triton::common::HostPolicyCmdlineConfigMap host_policy_map_;
  };

  ~PinnedMemoryManager();

  // Create the pinned memory manager based on 'options' specified.
  // Return Status object indicating success or failure.
  static Status Create(const Options& options);

  // Allocate pinned memory with the requested 'size' and return the pointer
  // in 'ptr'. If 'allow_nonpinned_fallback' is true, regular system memory
  // will be allocated as fallback in the case where pinned memory fails to
  // be allocated.
  // Return Status object indicating success or failure.
  static Status Alloc(
      void** ptr, uint64_t size, TRITONSERVER_MemoryType* allocated_type,
      bool allow_nonpinned_fallback);

  // Free the memory allocated by the pinned memory manager.
  // Return Status object indicating success or failure.
  static Status Free(void* ptr);

 protected:
  // Provide explicit control on the lifecycle of the CUDA memory manager,
  // for testing only.
  static void Reset();

 private:
  class PinnedMemory {
   public:
    PinnedMemory(void* pinned_memory_buffer, uint64_t size);
    ~PinnedMemory();
    void* pinned_memory_buffer_;
    std::mutex buffer_mtx_;
    boost::interprocess::managed_external_buffer managed_pinned_memory_;
  };

  PinnedMemoryManager() = default;

  Status AllocInternal(
      void** ptr, uint64_t size, TRITONSERVER_MemoryType* allocated_type,
      bool allow_nonpinned_fallback, PinnedMemory* pinned_memory_buffer);
  Status FreeInternal(void* ptr);
  void AddPinnedMemoryBuffer(
      const std::shared_ptr<PinnedMemory>& pinned_memory_buffer,
      unsigned long node_mask);

  static std::unique_ptr<PinnedMemoryManager> instance_;
  static uint64_t pinned_memory_byte_size_;

  std::mutex info_mtx_;
  std::map<void*, std::pair<bool, PinnedMemory*>> memory_info_;
  std::map<unsigned long, std::shared_ptr<PinnedMemory>>
      pinned_memory_buffers_;
};

}}  // namespace triton::core
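Editor's note: callers never touch the pool directly; they go through the static Create/Alloc/Free API above. A hedged usage sketch follows. It compiles only inside the Triton source tree (Status, TRITONSERVER_MemoryType, and the RETURN_IF_ERROR macro are in-repo definitions I assume are visible here), and UsePinnedPool is a hypothetical caller, not part of the API.

// Hedged usage sketch of the PinnedMemoryManager API declared above.
// Assumes in-tree headers provide Status, TRITONSERVER_MemoryType, and
// RETURN_IF_ERROR; illustrative only, not a standalone program.
#include "pinned_memory_manager.h"

triton::core::Status
UsePinnedPool()
{
  using triton::core::PinnedMemoryManager;

  // One-time setup: reserve a 64 MiB pinned pool.
  RETURN_IF_ERROR(
      PinnedMemoryManager::Create(PinnedMemoryManager::Options(64 << 20)));

  // Per-request: ask for 1 MiB, allowing fallback to regular system memory
  // if the pinned pool is exhausted. 'allocated_type' reports which kind of
  // memory was actually handed back.
  void* ptr = nullptr;
  TRITONSERVER_MemoryType allocated_type;
  RETURN_IF_ERROR(PinnedMemoryManager::Alloc(
      &ptr, 1 << 20, &allocated_type, true /* allow_nonpinned_fallback */));

  // ... use 'ptr' as a staging buffer for host<->device transfers ...

  return PinnedMemoryManager::Free(ptr);
}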
3rdparty/core-r22.12/src/rate_limiter.cc (new file, mode 100644)
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "rate_limiter.h"
#include <limits>
#include "triton/common/logging.h"
namespace
triton
{
namespace
core
{
constexpr
size_t
MAX_PAYLOAD_BUCKET_COUNT
=
1000
;
//=========================================================================
// Core Implementation
//=========================================================================
Status
RateLimiter
::
Create
(
const
bool
ignore_resources_and_priority
,
const
RateLimiter
::
ResourceMap
&
resource_map
,
std
::
unique_ptr
<
RateLimiter
>*
rate_limiter
)
{
std
::
unique_ptr
<
RateLimiter
>
local_rate_limiter
(
new
RateLimiter
(
ignore_resources_and_priority
,
resource_map
));
*
rate_limiter
=
std
::
move
(
local_rate_limiter
);
return
Status
::
Success
;
}
Status
RateLimiter
::
RegisterModelInstance
(
TritonModelInstance
*
triton_model_instance
,
const
RateLimiterConfig
&
rate_limiter_config
)
{
{
std
::
lock_guard
<
std
::
mutex
>
lk1
(
model_ctx_mtx_
);
std
::
lock_guard
<
std
::
mutex
>
lk2
(
model_instance_ctx_mtx_
);
auto
&
model_context
=
model_contexts_
[
triton_model_instance
->
Model
()];
auto
&
model_instances
=
model_instance_ctxs_
[
triton_model_instance
->
Model
()];
model_instances
.
push_back
(
std
::
shared_ptr
<
ModelInstanceContext
>
(
new
ModelInstanceContext
(
triton_model_instance
,
&
model_context
,
rate_limiter_config
,
[
this
](
ModelInstanceContext
*
instance
)
{
OnStage
(
instance
);
},
[
this
](
ModelInstanceContext
*
instance
)
{
OnRelease
(
instance
);
})));
model_context
.
AddAvailableInstance
(
model_instances
.
back
().
get
());
model_context
.
AddSpecificRequestQueue
();
if
(
!
ignore_resources_and_priority_
)
{
resource_manager_
->
AddModelInstance
(
model_instances
.
back
().
get
());
RETURN_IF_ERROR
(
resource_manager_
->
UpdateResourceLimits
());
}
}
InitializePayloadQueues
(
triton_model_instance
);
return
Status
::
Success
;
}
Status
RateLimiter
::
UnregisterModel
(
const
TritonModel
*
model
)
{
{
std
::
lock_guard
<
std
::
mutex
>
lk1
(
model_ctx_mtx_
);
std
::
lock_guard
<
std
::
mutex
>
lk2
(
model_instance_ctx_mtx_
);
auto
&
model_context
=
model_contexts_
[
model
];
model_context
.
RequestRemoval
();
for
(
const
auto
&
instance
:
model_instance_ctxs_
[
model
])
{
instance
->
WaitForRemoval
();
if
(
!
ignore_resources_and_priority_
)
{
resource_manager_
->
RemoveModelInstance
(
instance
.
get
());
}
}
model_instance_ctxs_
.
erase
(
model
);
model_contexts_
.
erase
(
model
);
}
if
(
!
ignore_resources_and_priority_
)
{
RETURN_IF_ERROR
(
resource_manager_
->
UpdateResourceLimits
());
}
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
payload_queues_mu_
);
if
(
payload_queues_
.
find
(
model
)
!=
payload_queues_
.
end
())
{
payload_queues_
.
erase
(
model
);
}
}
return
Status
::
Success
;
}
bool
RateLimiter
::
PayloadSlotAvailable
(
const
TritonModel
*
model
)
{
bool
result
;
PayloadQueue
*
payload_queue
=
payload_queues_
[
model
].
get
();
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
payload_queue
->
mu_
);
result
=
payload_queue
->
queue_
->
Size
()
<
2
*
payload_queue
->
specific_queues_
.
size
();
}
return
result
;
}
Status
RateLimiter::EnqueuePayload(
    const TritonModel* model, std::shared_ptr<Payload> payload)
{
  auto pinstance = payload->GetInstance();
  if (payload_queues_.find(model) == payload_queues_.end()) {
    LOG_INFO << "Should not print this ";
  }
  PayloadQueue* payload_queue = payload_queues_[model].get();
  {
    std::lock_guard<std::mutex> lk(payload_queue->mu_);
    payload->SetState(Payload::State::REQUESTED);
    if (ignore_resources_and_priority_) {
      SchedulePayload(pinstance, payload_queue, payload);
    }
  }
  if (ignore_resources_and_priority_) {
    if (pinstance == nullptr) {
      payload_queue->cv_.notify_one();
    } else {
      payload_queue->cv_.notify_all();
    }
  } else {
    StandardScheduleFunc sched_func = [this, payload_queue,
                                       payload](ModelInstanceContext* mi) {
      {
        std::lock_guard<std::mutex> lk(payload_queue->mu_);
        this->SchedulePayload(mi->RawInstance(), payload_queue, payload);
      }
      auto cb = [mi]() { mi->Release(); };
      payload->AddInternalReleaseCallback(cb);
      if (mi->RawInstance() == nullptr) {
        payload_queue->cv_.notify_one();
      } else {
        payload_queue->cv_.notify_all();
      }
    };
    DeferPayloadSchedule(sched_func, model, payload->GetInstance());
  }
  return Status::Success;
}
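// A typical producer-side pairing of GetPayload()/EnqueuePayload(), shown
// only as a hypothetical sketch (the real callers live in the scheduler
// implementations, and the exact Payload::Operation enumerator used here is
// an assumption):
//
//   std::shared_ptr<Payload> payload =
//       rate_limiter->GetPayload(Payload::Operation::INFER_RUN);
//   // ... attach requests to the payload ...
//   RETURN_IF_ERROR(rate_limiter->EnqueuePayload(model, payload));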
void
RateLimiter::DequeuePayload(
    std::deque<TritonModelInstance*>& instances,
    std::shared_ptr<Payload>* payload)
{
  payload->reset();
  if (payload_queues_.find(instances[0]->Model()) == payload_queues_.end()) {
    LOG_INFO << "Should not print this ";
  }
  PayloadQueue* payload_queue = payload_queues_[instances[0]->Model()].get();
  std::vector<std::shared_ptr<Payload>> merged_payloads;
  size_t instance_index = std::numeric_limits<std::size_t>::max();
  {
    std::unique_lock<std::mutex> lk(payload_queue->mu_);
    payload_queue->cv_.wait(
        lk, [&instances, &instance_index, payload_queue]() {
          bool empty = payload_queue->queue_->Empty();
          if (empty) {
            instance_index = 0;
            for (const auto instance : instances) {
              empty = payload_queue->specific_queues_[instance]->Empty();
              if (empty) {
                instance_index++;
              } else {
                break;
              }
            }
          }
          return !empty;
        });
    if (instance_index < instances.size()) {
      TritonModelInstance* instance = instances[instance_index];
      if (!payload_queue->specific_queues_[instance]->Empty()) {
        payload_queue->specific_queues_[instance]->Dequeue(
            payload, &merged_payloads);
      }
    } else {
      payload_queue->queue_->Dequeue(payload, &merged_payloads);
    }
  }
  for (auto& merge_payload : merged_payloads) {
    PayloadRelease(merge_payload);
  }
  (*payload)->Callback();
  if ((*payload)->GetInstance() == nullptr) {
    (*payload)->SetInstance(instances.front());
    instances.pop_front();
  } else {
    instances.erase(instances.begin() + instance_index);
  }
}
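// Note: DequeuePayload() blocks on the queue's condition variable until
// either the generic queue or one of the caller's instance-specific queues
// is non-empty. instance_index doubles as a sentinel: it stays at
// numeric_limits<size_t>::max() when the generic queue satisfied the wait,
// and otherwise records which specific queue the wait predicate selected.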
std::shared_ptr<Payload>
RateLimiter::GetPayload(
    const Payload::Operation op_type, TritonModelInstance* instance)
{
  std::shared_ptr<Payload> payload;
  if (max_payload_bucket_count_ > 0) {
    std::lock_guard<std::mutex> lock(payload_mu_);
    if (!payload_bucket_.empty()) {
      payload = payload_bucket_.back();
      payload_bucket_.pop_back();
    }
    if (payload.get() == nullptr && (!payloads_in_use_.empty())) {
      // Just check the front of the queue instead of the entire queue for
      // an available payload, to save time.
      if (payloads_in_use_.front().use_count() == 1) {
        payload = payloads_in_use_.front();
        payloads_in_use_.pop_front();
      }
    }
  }
  if (payload.get() == nullptr) {
    payload.reset(new Payload());
  }
  payload->Reset(op_type, instance);
  return payload;
}
void
RateLimiter::PayloadRelease(std::shared_ptr<Payload>& payload)
{
  payload->OnRelease();
  if (max_payload_bucket_count_ > 0) {
    std::lock_guard<std::mutex> lock(payload_mu_);
    if (payloads_in_use_.size() + payload_bucket_.size() <
        max_payload_bucket_count_) {
      // Release iff the payload shared_ptr is uniquely held.
      if (payload.use_count() == 1) {
        payload->Release();
        payload_bucket_.push_back(std::move(payload));
        return;
      } else {
        payloads_in_use_.push_back(std::move(payload));
      }
    }
  }
}
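// Note on recycling: payload_bucket_ holds payloads that are free for
// immediate reuse, while payloads_in_use_ holds payloads whose shared_ptr is
// still referenced elsewhere; GetPayload() opportunistically reclaims from
// the front of payloads_in_use_ once a payload's use_count drops to 1.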
RateLimiter::RateLimiter(
    const bool ignore_resources_and_priority, const ResourceMap& resource_map)
    : ignore_resources_and_priority_(ignore_resources_and_priority),
      max_payload_bucket_count_(MAX_PAYLOAD_BUCKET_COUNT)
{
  ResourceManager::Create(resource_map, &resource_manager_);
}
void
RateLimiter::InitializePayloadQueues(const TritonModelInstance* instance)
{
  auto& config = instance->Model()->Config();
  uint64_t max_queue_delay_microseconds;
  if (config.has_sequence_batching()) {
    const auto& batcher_config = config.sequence_batching();
    if (batcher_config.has_oldest()) {
      max_queue_delay_microseconds =
          batcher_config.oldest().max_queue_delay_microseconds();
    } else {
      max_queue_delay_microseconds = 0;
    }
  } else if (config.has_dynamic_batching()) {
    max_queue_delay_microseconds =
        config.dynamic_batching().max_queue_delay_microseconds();
  } else {
    max_queue_delay_microseconds = 0;
  }
  {
    std::lock_guard<std::mutex> lk(payload_queues_mu_);
    if (payload_queues_.find(instance->Model()) == payload_queues_.end()) {
      payload_queues_.emplace(
          instance->Model(),
          new PayloadQueue(
              config.max_batch_size(), max_queue_delay_microseconds * 1000));
    }
  }
  PayloadQueue* payload_queue = payload_queues_[instance->Model()].get();
  if (payload_queue->specific_queues_.find(instance) ==
      payload_queue->specific_queues_.end()) {
    payload_queue->specific_queues_.emplace(
        instance,
        new InstanceQueue(
            config.max_batch_size(), max_queue_delay_microseconds * 1000));
  }
}
Status
RateLimiter::DeferPayloadSchedule(
    const StandardScheduleFunc& OnSchedule, const TritonModel* model,
    TritonModelInstance* triton_model_instance)
{
  std::lock_guard<std::mutex> lk(model_ctx_mtx_);
  auto itr = model_contexts_.find(model);
  if (itr == model_contexts_.end()) {
    return Status(
        Status::Code::INTERNAL,
        "Requested model is not yet registered with rate limiter");
  }
  if (itr->second.isRemovalInProgress()) {
    return Status(
        Status::Code::INTERNAL,
        "New model requests can not be made to a model that is being "
        "removed");
  }
  itr->second.EnqueueModelInstanceRequest(OnSchedule, triton_model_instance);
  itr->second.StageInstanceIfAvailable(triton_model_instance);
  return Status::Success;
}
void
RateLimiter::SchedulePayload(
    TritonModelInstance* tmi, PayloadQueue* payload_queue,
    const std::shared_ptr<Payload>& payload)
{
  if (tmi == nullptr) {
    payload_queue->queue_->Enqueue(payload);
  } else {
    payload_queue->specific_queues_[tmi]->Enqueue(payload);
  }
  payload->SetState(Payload::State::SCHEDULED);
}
void
RateLimiter::OnStage(ModelInstanceContext* instance)
{
  {
    std::lock_guard<std::recursive_mutex> lk(staged_instances_mtx_);
    staged_instances_.push(instance);
  }
  AttemptAllocation();
}
void
RateLimiter::OnRelease(ModelInstanceContext* instance)
{
  auto& model_context = model_contexts_[instance->RawInstance()->Model()];
  model_context.AddAvailableInstance(instance);
  resource_manager_->ReleaseResources(instance);
  if (model_context.ContainsPendingRequests(
          instance->RawInstance()->Index())) {
    model_context.StageInstanceIfAvailable(instance->RawInstance());
  }
  AttemptAllocation();
}
void
RateLimiter::AttemptAllocation()
{
  std::lock_guard<std::recursive_mutex> lk(staged_instances_mtx_);
  if (!staged_instances_.empty()) {
    ModelInstanceContext* instance = staged_instances_.top();
    if (resource_manager_->AllocateResources(instance)) {
      staged_instances_.pop();
      instance->Allocate();
    }
  }
}
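// Note: AttemptAllocation() only examines the highest-priority staged
// instance. If that instance's resource demand cannot be satisfied,
// lower-priority staged instances are not considered until resources are
// released and AttemptAllocation() runs again (see OnRelease()).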
//=========================================================================
// ModelContext Implementation
//=========================================================================
RateLimiter::ModelContext::ModelContext() : removal_in_progress_(false) {}
Status
RateLimiter::ModelContext::EnqueueModelInstanceRequest(
    const StandardScheduleFunc& OnSchedule,
    TritonModelInstance* triton_model_instance)
{
  std::lock_guard<std::recursive_mutex> lk(sched_request_queue_mtx_);
  if (triton_model_instance == nullptr) {
    generic_sched_request_queue_.push(OnSchedule);
  } else if (
      (uint32_t)triton_model_instance->Index() <
      specific_sched_request_queues_.size()) {
    specific_sched_request_queues_[triton_model_instance->Index()].push(
        OnSchedule);
  } else {
    return Status(
        Status::Code::INTERNAL,
        "expected instance index between 0 and " +
            std::to_string(specific_sched_request_queues_.size()) + ", got " +
            std::to_string(triton_model_instance->Index()));
  }
  return Status::Success;
}
void
RateLimiter::ModelContext::AddAvailableInstance(ModelInstanceContext* instance)
{
  std::lock_guard<std::recursive_mutex> lk(avbl_instances_mtx_);
  avbl_instances_.push(instance);
  instance->MarkAvailable();
}
void
RateLimiter::ModelContext::StageInstanceIfAvailable(
    TritonModelInstance* req_instance)
{
  std::lock_guard<std::recursive_mutex> lk1(sched_request_queue_mtx_);
  std::lock_guard<std::recursive_mutex> lk2(avbl_instances_mtx_);
  PriorityQueue backup_queue;
  while (!avbl_instances_.empty()) {
    ModelInstanceContext* instance = avbl_instances_.top();
    if ((req_instance != nullptr) &&
        (instance->RawInstance() != req_instance)) {
      backup_queue.push(instance);
      avbl_instances_.pop();
      continue;
    }
    if (!specific_sched_request_queues_[instance->RawInstance()->Index()]
             .empty()) {
      // Requests that target this specific model instance get the highest
      // priority.
      const StandardScheduleFunc func =
          specific_sched_request_queues_[instance->RawInstance()->Index()]
              .front();
      specific_sched_request_queues_[instance->RawInstance()->Index()].pop();
      instance->Stage(func);
    } else if (!generic_sched_request_queue_.empty()) {
      // A generic request can run on any model instance, so use the
      // available instance with the highest priority.
      const StandardScheduleFunc func = generic_sched_request_queue_.front();
      generic_sched_request_queue_.pop();
      instance->Stage(func);
    } else {
      // Only requests for other specific model instances remain, so back up
      // this instance and keep searching through the available instances.
      // Prioritization is handled by the staging priority queue.
      backup_queue.push(instance);
    }
    avbl_instances_.pop();
  }
  // Restore the backup queue
  if (!backup_queue.empty()) {
    avbl_instances_.swap(backup_queue);
  }
}
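// Note: both StageInstanceIfAvailable() and AllocateInstanceIfAvailable()
// drain avbl_instances_ into a backup queue and swap it back at the end, so
// instances that received no work keep their relative priority ordering.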
void
RateLimiter::ModelContext::AllocateInstanceIfAvailable()
{
  std::lock_guard<std::recursive_mutex> lk1(sched_request_queue_mtx_);
  std::lock_guard<std::recursive_mutex> lk2(avbl_instances_mtx_);
  PriorityQueue backup_queue;
  while (!avbl_instances_.empty()) {
    ModelInstanceContext* instance = avbl_instances_.top();
    if (!specific_sched_request_queues_[instance->RawInstance()->Index()]
             .empty()) {
      // Requests that target this specific model instance get the highest
      // priority.
      const StandardScheduleFunc func =
          specific_sched_request_queues_[instance->RawInstance()->Index()]
              .front();
      specific_sched_request_queues_[instance->RawInstance()->Index()].pop();
      instance->DirectAllocate(func);
    } else if (!generic_sched_request_queue_.empty()) {
      // A generic request can run on any model instance, so use the
      // available instance with the highest priority.
      const StandardScheduleFunc func = generic_sched_request_queue_.front();
      generic_sched_request_queue_.pop();
      instance->DirectAllocate(func);
    } else {
      // Only requests for other specific model instances remain, so back up
      // this instance and keep searching through the available instances.
      // Prioritization is handled by the staging priority queue.
      backup_queue.push(instance);
    }
    avbl_instances_.pop();
  }
  // Restore the backup queue
  if (!backup_queue.empty()) {
    avbl_instances_.swap(backup_queue);
  }
}
void
RateLimiter::ModelContext::AddSpecificRequestQueue()
{
  std::lock_guard<std::recursive_mutex> lk(sched_request_queue_mtx_);
  specific_sched_request_queues_.emplace_back();
}
bool
RateLimiter::ModelContext::ContainsPendingRequests(int index)
{
  std::lock_guard<std::recursive_mutex> lk(sched_request_queue_mtx_);
  return (generic_sched_request_queue_.size() != 0) ||
         (specific_sched_request_queues_[index].size() != 0);
}
void
RateLimiter::ModelContext::RequestRemoval()
{
  removal_in_progress_ = true;
}
//=========================================================================
// ModelInstanceContext Implementation
//=========================================================================
RateLimiter::ModelInstanceContext::ModelInstanceContext(
    TritonModelInstance* triton_model_instance,
    RateLimiter::ModelContext* model_context,
    const RateLimiter::RateLimiterConfig& rate_limiter_config,
    RateLimiter::StandardStageFunc OnStage,
    RateLimiter::StandardReleaseFunc OnRelease)
    : triton_model_instance_(triton_model_instance),
      index_(triton_model_instance->Index()), model_context_(model_context),
      rate_limiter_config_(rate_limiter_config), OnStage_(OnStage),
      OnRelease_(OnRelease), exec_count_(0), state_(AVAILABLE)
{
}
void
RateLimiter::ModelInstanceContext::MarkAvailable()
{
  std::lock_guard<std::mutex> lk(state_mtx_);
  state_ = AVAILABLE;
}
Status
RateLimiter::ModelInstanceContext::Stage(StandardScheduleFunc OnSchedule)
{
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if (state_ != AVAILABLE) {
      return Status(
          Status::Code::INTERNAL,
          "Can not stage a model instance that is not yet available");
    }
    state_ = STAGED;
    OnSchedule_ = OnSchedule;
  }
  OnStage_(this);
  return Status::Success;
}
Status
RateLimiter::ModelInstanceContext::Allocate()
{
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if (state_ != STAGED) {
      return Status(
          Status::Code::INTERNAL,
          "Can not allocate a model instance that is not yet staged");
    }
    state_ = ALLOCATED;
  }
  OnSchedule_(this);
  return Status::Success;
}
Status
RateLimiter::ModelInstanceContext::DirectAllocate(
    StandardScheduleFunc OnSchedule)
{
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if (state_ != AVAILABLE) {
      return Status(
          Status::Code::INTERNAL,
          "Can not allocate a model instance that is not yet available");
    }
    state_ = ALLOCATED;
  }
  OnSchedule(this);
  return Status::Success;
}
void
RateLimiter::ModelInstanceContext::Release()
{
  exec_count_++;
  OnRelease_(this);
  {
    std::lock_guard<std::mutex> lk(state_mtx_);
    if ((model_context_->isRemovalInProgress()) && (state_ == AVAILABLE) &&
        (!model_context_->ContainsPendingRequests(index_))) {
      state_ = REMOVED;
    }
  }
  if (state_ == REMOVED) {
    cv_.notify_all();
  }
}
void
RateLimiter::ModelInstanceContext::RequestRemoval()
{
  std::lock_guard<std::mutex> lk(state_mtx_);
  if ((state_ == AVAILABLE) &&
      (!model_context_->ContainsPendingRequests(index_))) {
    state_ = REMOVED;
  }
}
void
RateLimiter::ModelInstanceContext::WaitForRemoval()
{
  if (!model_context_->isRemovalInProgress()) {
    model_context_->RequestRemoval();
  }
  RequestRemoval();
  // Wait for the instance to be removed
  {
    std::unique_lock<std::mutex> lk(state_mtx_);
    cv_.wait(lk, [this] { return state_ == REMOVED; });
  }
}
double
RateLimiter::ModelInstanceContext::ScaledPriority()
{
  // TODO: Different schemes for the prioritization of model instances can
  // be added here.
  // The priority of an instance is 1 by default. If specified as 0, the
  // priority is still treated as 1.
  auto priority = std::max(rate_limiter_config_.priority(), 1u);
  return (exec_count_ * priority);
}
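// Note: ScaledPriorityComparator (see rate_limiter.h) orders the priority
// queue so that the instance with the *smallest* ScaledPriority() is on top.
// Since ScaledPriority() returns exec_count_ * priority, instances with a
// lower configured priority value and fewer completed executions are
// scheduled first, which yields weighted round-robin behavior across
// instances.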
//=========================================================================
// ResourceManager Implementation
//=========================================================================
Status
RateLimiter::ResourceManager::Create(
    const ResourceMap& resource_map,
    std::unique_ptr<ResourceManager>* resource_manager)
{
  std::unique_ptr<ResourceManager> local_resource_manager(
      new ResourceManager(resource_map));
  *resource_manager = std::move(local_resource_manager);
  return Status::Success;
}
void
RateLimiter::ResourceManager::AddModelInstance(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk(model_resources_mtx_);
  auto pr = model_resources_.emplace(std::make_pair(instance, ResourceMap()));
  for (const auto& resource :
       instance->GetRateLimiterConfig()->resources()) {
    if (resource.global()) {
      (pr.first->second[GLOBAL_RESOURCE_KEY])[resource.name()] =
          resource.count();
    } else {
      (pr.first->second[instance->RawInstance()->DeviceId()])[resource.name()] =
          resource.count();
    }
  }
}
Status
RateLimiter::ResourceManager::RemoveModelInstance(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk(model_resources_mtx_);
  const auto& itr = model_resources_.find(instance);
  if (itr == model_resources_.end()) {
    return Status(
        Status::Code::INTERNAL, "Can not find the instance to remove");
  }
  model_resources_.erase(instance);
  return Status::Success;
}
Status
RateLimiter::ResourceManager::UpdateResourceLimits()
{
  std::lock_guard<std::mutex> lk1(max_resources_mtx_);
  std::lock_guard<std::mutex> lk2(model_resources_mtx_);
  max_resources_.clear();
  // Obtain the maximum resource across all the instances
  // and use it as the default available.
  for (const auto& instance_resources : model_resources_) {
    for (const auto& resource_device_map : instance_resources.second) {
      auto ditr = max_resources_.find(resource_device_map.first);
      if (ditr == max_resources_.end()) {
        ditr = max_resources_
                   .emplace(
                       resource_device_map.first, resource_device_map.second)
                   .first;
      } else {
        for (const auto resource : resource_device_map.second) {
          auto ritr = ditr->second.find(resource.first);
          if (ritr == ditr->second.end()) {
            ritr = ditr->second.emplace(resource.first, resource.second).first;
          } else {
            if (ritr->second < resource.second) {
              ritr->second = resource.second;
            }
          }
        }
      }
    }
  }
  if (!explicit_max_resources_.empty()) {
    RETURN_IF_ERROR(ParseAndValidateExplicitResources());
  }
  RETURN_IF_ERROR(ValidateMaxResources());
  if (LOG_VERBOSE_IS_ON(1)) {
    std::string resource_map_str{"\nMax Resource Map===>\n"};
    for (const auto& ditr : max_resources_) {
      if (!ditr.second.empty()) {
        std::string device_str{
            (ditr.first == GLOBAL_RESOURCE_KEY) ? "GLOBAL"
                                                : std::to_string(ditr.first)};
        resource_map_str += "\tDevice: " + device_str + "\n";
        for (const auto& ritr : ditr.second) {
          resource_map_str += "\t\tResource: " + ritr.first +
                              "\tCount: " + std::to_string(ritr.second) + "\n";
        }
      }
    }
    LOG_VERBOSE(1) << resource_map_str;
  }
  return Status::Success;
}
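// Note: when no resource counts are provided explicitly,
// UpdateResourceLimits() takes the maximum per-resource demand across all
// registered instances as the available pool size, so every instance
// remains individually schedulable by construction.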
Status
RateLimiter::ResourceManager::ValidateMaxResources()
{
  for (const auto& global_resource : max_resources_[GLOBAL_RESOURCE_KEY]) {
    for (const auto& ditr : max_resources_) {
      if (ditr.first != GLOBAL_RESOURCE_KEY) {
        for (const auto& ritr : ditr.second) {
          if (global_resource.first.compare(ritr.first) == 0) {
            return Status(
                Status::Code::INVALID_ARG,
                (std::string("Resource \"") + ritr.first +
                 "\" is present as both global and device-specific resource "
                 "in the model configuration.")
                    .c_str());
          }
        }
      }
    }
  }
  return Status::Success;
}
Status
RateLimiter::ResourceManager::ParseAndValidateExplicitResources()
{
  for (auto& ditr : max_resources_) {
    for (auto& ritr : ditr.second) {
      // If not specified explicitly, consider the resource to be unavailable.
      size_t resource_count = 0;
      if (ditr.first == GLOBAL_RESOURCE_KEY) {
        // Ignore the device specification... will search for all resources in
        // the map...
        for (const auto& exp_ditr : explicit_max_resources_) {
          for (const auto& exp_ritr : exp_ditr.second) {
            if (ritr.first.compare(exp_ritr.first) == 0) {
              if (resource_count < exp_ritr.second) {
                resource_count = exp_ritr.second;
              }
            }
          }
        }
      } else {
        // Search only for the device specific or per-device resources...
        // device-specific
        for (const auto& exp_ritr : explicit_max_resources_[ditr.first]) {
          if (ritr.first.compare(exp_ritr.first) == 0) {
            if (resource_count < exp_ritr.second) {
              resource_count = exp_ritr.second;
            }
          }
        }
        // per-device
        for (const auto& exp_ritr :
             explicit_max_resources_[PER_DEVICE_RESOURCE_KEY]) {
          if (ritr.first.compare(exp_ritr.first) == 0) {
            if (resource_count < exp_ritr.second) {
              resource_count = exp_ritr.second;
            }
          }
        }
      }
      if (resource_count < ritr.second) {
        return Status(
            Status::Code::INVALID_ARG,
            (std::string("Resource count for \"") + ritr.first +
             "\" is limited to " + std::to_string(resource_count) +
             " which will prevent scheduling of one or more model "
             "instances, the minimum required count is " +
             std::to_string(ritr.second))
                .c_str());
      } else {
        ritr.second = resource_count;
      }
    }
  }
  return Status::Success;
}
bool
RateLimiter::ResourceManager::AllocateResources(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk1(model_resources_mtx_);
  std::lock_guard<std::mutex> lk2(allocated_resources_mtx_);
  const auto& itr = model_resources_.find(instance);
  if (itr == model_resources_.end()) {
    return false;
  } else {
    // First pass to verify if resources are available
    {
      std::lock_guard<std::mutex> lk3(max_resources_mtx_);
      for (const auto& ditr : itr->second) {
        auto allocated_ditr = allocated_resources_.find(ditr.first);
        if (allocated_ditr == allocated_resources_.end()) {
          allocated_ditr =
              allocated_resources_
                  .emplace(ditr.first, std::map<std::string, size_t>())
                  .first;
        }
        for (const auto& ritr : ditr.second) {
          auto allocated_ritr = allocated_ditr->second.find(ritr.first);
          if (allocated_ritr == allocated_ditr->second.end()) {
            allocated_ritr =
                allocated_ditr->second.emplace(ritr.first, 0).first;
          }
          if ((allocated_ritr->second + ritr.second) >
              (max_resources_[ditr.first])[ritr.first]) {
            return false;
          }
        }
      }
    }
    // Second pass to actually allocate the resources
    for (const auto& ditr : itr->second) {
      for (const auto& ritr : ditr.second) {
        (allocated_resources_[ditr.first])[ritr.first] += ritr.second;
      }
    }
  }
  return true;
}
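// Note: AllocateResources() is all-or-nothing. The first pass only verifies
// that every resource demanded by the instance fits under max_resources_;
// the second pass commits the allocation. Holding allocated_resources_mtx_
// across both passes keeps the check-then-act sequence atomic.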
Status
RateLimiter::ResourceManager::ReleaseResources(
    const ModelInstanceContext* instance)
{
  std::lock_guard<std::mutex> lk1(model_resources_mtx_);
  std::lock_guard<std::mutex> lk2(allocated_resources_mtx_);
  const auto& itr = model_resources_.find(instance);
  if (itr == model_resources_.end()) {
    return Status(
        Status::Code::INTERNAL,
        "Unable to find the instance resources to release");
  } else {
    for (const auto& ditr : itr->second) {
      for (const auto& ritr : ditr.second) {
        (allocated_resources_[ditr.first])[ritr.first] -= ritr.second;
      }
    }
  }
  return Status::Success;
}
RateLimiter::ResourceManager::ResourceManager(const ResourceMap& resource_map)
    : explicit_max_resources_(resource_map)
{
}
}}  // namespace triton::core
3rdparty/core-r22.12/src/rate_limiter.h
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <vector>
#include "backend_model.h"
#include "backend_model_instance.h"
#include "instance_queue.h"
#include "model_config.pb.h"
#include "payload.h"
#include "status.h"
namespace triton { namespace core {

// Limits the rate at which requests are dispatched to the model instances.
class RateLimiter {
 public:
  using RateLimiterConfig = inference::ModelRateLimiter;
  using ResourceMap = std::map<int, std::map<std::string, size_t>>;

  enum RESOURCE_KIND_KEY {
    // Key for holding global resources
    GLOBAL_RESOURCE_KEY = -2,
    // Key for holding resources per each device
    PER_DEVICE_RESOURCE_KEY = -1
  };
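  // ResourceMap is keyed by device: GLOBAL_RESOURCE_KEY (-2) for resources
  // shared across devices, PER_DEVICE_RESOURCE_KEY (-1) for a count that
  // applies to each device separately, and a non-negative device id for a
  // single device. For example (hypothetical values), a map of
  //   { -2: {"R1": 10}, 0: {"R2": 4} }
  // exposes 10 units of global resource "R1" and 4 units of "R2" on
  // device 0.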
  /// Creates a rate limiter object which will funnel the requests to
  /// the model instances. A typical lifetime of a model instance within the
  /// RateLimiter transitions from available -> staged -> allocated ->
  /// available. The transition from available to staged occurs when a
  /// request is registered for the model. Depending upon the resource
  /// availability and priority, the RateLimiter will transition an instance
  /// to the allocated state at some point in the future. The staged state is
  /// skipped when configured to ignore the resource constraints. The cycle
  /// in this case will be available -> allocated -> available.
  /// \param ignore_resources_and_priority Whether or not to ignore resource
  /// constraints and cross-model priority. An available instance is directly
  /// allocated when true.
  /// \param resource_map The map to the available resource count provided
  /// explicitly.
  /// \return Status object indicating success or failure.
  static Status Create(
      const bool ignore_resources_and_priority,
      const ResourceMap& resource_map,
      std::unique_ptr<RateLimiter>* rate_limiter);
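  // The instance state machine described above, in brief:
  //
  //   AVAILABLE --Stage()--> STAGED --Allocate()--> ALLOCATED
  //       ^                                             |
  //       +---------------- Release() ------------------+
  //
  // (DirectAllocate() goes AVAILABLE -> ALLOCATED when resources and
  // priority are ignored; REMOVED is entered during model unregistration.)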
  /// Registers the model instance with the rate limiter.
  /// \param instance The pointer to the TritonModelInstance object to
  /// register with the rate limiter.
  /// \param rate_limiter_config The rate limiter configuration associated
  /// with the model instance.
  /// \return Status object indicating success or failure.
  Status RegisterModelInstance(
      TritonModelInstance* instance,
      const RateLimiterConfig& rate_limiter_config);
  /// Remove model from the set of models being managed by the rate limiter.
  /// \param model The pointer to TritonModel object to be removed.
  /// \return Status object indicating success or failure.
  Status UnregisterModel(const TritonModel* model);
  /// Returns true if there is a payload slot available for the given model.
  /// \param model The pointer to the TritonModel object to check.
  /// \return slot availability in boolean.
  bool PayloadSlotAvailable(const TritonModel* model);
  /// Enqueues the payload to rate limiter for scheduling on the given model.
  /// \param model The pointer to the TritonModel object that will execute
  /// the payload.
  /// \param payload The shared pointer to the payload object.
  /// \return Status object indicating success or failure.
  Status EnqueuePayload(
      const TritonModel* model, std::shared_ptr<Payload> payload);
  /// Returns the payload that has been scheduled for the given set of model
  /// instances. Note that this call is blocking and depends upon the
  /// availability of payloads in the rate limiter for the triton model
  /// instances.
  /// \param instances The pointers to TritonModelInstance objects whose
  /// payload is being requested.
  /// \param payload The shared pointer to the payload object.
  void DequeuePayload(
      std::deque<TritonModelInstance*>& instances,
      std::shared_ptr<Payload>* payload);
  /// Returns a new payload object.
  /// \param op_type The operation type for the payload.
  /// \param instance Optional field that provides the model instance that
  /// must be used for the execution of the payload. Default is nullptr,
  /// which allows any model instance to execute the payload.
  /// \return The shared pointer to a new payload object.
  std::shared_ptr<Payload> GetPayload(
      const Payload::Operation op_type,
      TritonModelInstance* instance = nullptr);
  /// Releases the given payload object back to the rate limiter.
  /// \param payload The payload to release.
  void PayloadRelease(std::shared_ptr<Payload>& payload);
 private:
  class ModelInstanceContext;
  class ModelContext;
  struct PayloadQueue;

  using StandardReleaseFunc = std::function<void(ModelInstanceContext*)>;
  using StandardScheduleFunc = std::function<void(ModelInstanceContext*)>;
  using StandardStageFunc = std::function<void(ModelInstanceContext*)>;
  // Holds the state of the model instance.
  class ModelInstanceContext {
   public:
    friend class RateLimiter;
    friend class ResourceManager;

    enum State { AVAILABLE, STAGED, ALLOCATED, REMOVED };

    void Release();
    TritonModelInstance* RawInstance() const
    {
      return triton_model_instance_;
    }

   private:
    ModelInstanceContext(
        TritonModelInstance* triton_model_instance,
        ModelContext* model_context,
        const RateLimiterConfig& rate_limiter_config,
        StandardStageFunc OnStage, StandardReleaseFunc OnRelease);

    const RateLimiterConfig* GetRateLimiterConfig() const
    {
      return &rate_limiter_config_;
    }

    void MarkAvailable();
    double ScaledPriority();
    Status Stage(StandardScheduleFunc OnSchedule);
    Status Allocate();
    Status DirectAllocate(StandardScheduleFunc OnSchedule);
    void RequestRemoval();
    void WaitForRemoval();

    TritonModelInstance* triton_model_instance_;
    size_t index_;
    ModelContext* model_context_;
    RateLimiterConfig rate_limiter_config_;
    StandardStageFunc OnStage_;
    StandardReleaseFunc OnRelease_;
    std::atomic<uint64_t> exec_count_;
    State state_;
    bool removal_in_progress_;
    std::mutex state_mtx_;
    StandardScheduleFunc OnSchedule_;
    std::condition_variable cv_;
  };
  class ScaledPriorityComparator {
   public:
    bool operator()(ModelInstanceContext* a, ModelInstanceContext* b)
    {
      return a->ScaledPriority() > b->ScaledPriority();
    }
  };

  using PriorityQueue = std::priority_queue<
      ModelInstanceContext*, std::vector<ModelInstanceContext*>,
      ScaledPriorityComparator>;
  // Holds the active context of a model
  class ModelContext {
   public:
    ModelContext();
    Status EnqueueModelInstanceRequest(
        const StandardScheduleFunc& OnSchedule,
        TritonModelInstance* triton_model_instance);
    void AddAvailableInstance(ModelInstanceContext* instance);
    void StageInstanceIfAvailable(TritonModelInstance* triton_model_instance);
    void AllocateInstanceIfAvailable();
    void AddSpecificRequestQueue();
    bool ContainsPendingRequests(int32_t index);
    void RequestRemoval();
    bool isRemovalInProgress() { return removal_in_progress_; }

   private:
    bool removal_in_progress_;
    // Queues holding pending scheduling requests
    std::queue<StandardScheduleFunc> generic_sched_request_queue_;
    std::vector<std::queue<StandardScheduleFunc>>
        specific_sched_request_queues_;
    std::recursive_mutex sched_request_queue_mtx_;
    // The set of instances that are available at the moment
    PriorityQueue avbl_instances_;
    std::recursive_mutex avbl_instances_mtx_;
  };
  // Manages and keeps track of resource allocation to the model instances.
  class ResourceManager {
   public:
    static Status Create(
        const ResourceMap& resource_map,
        std::unique_ptr<ResourceManager>* resource_manager);
    void AddModelInstance(const ModelInstanceContext* instance);
    Status RemoveModelInstance(const ModelInstanceContext* instance);
    Status UpdateResourceLimits();
    bool AllocateResources(const ModelInstanceContext* instance);
    Status ReleaseResources(const ModelInstanceContext* instance);

   private:
    ResourceManager(const ResourceMap& resource_map);
    Status ValidateMaxResources();
    Status ParseAndValidateExplicitResources();

    ResourceMap explicit_max_resources_;

    std::map<const ModelInstanceContext*, ResourceMap> model_resources_;
    std::mutex model_resources_mtx_;

    ResourceMap max_resources_;
    std::mutex max_resources_mtx_;

    ResourceMap allocated_resources_;
    std::mutex allocated_resources_mtx_;
  };
  RateLimiter(
      const bool ignore_resources_and_priority,
      const ResourceMap& resource_map);

  void InitializePayloadQueues(const TritonModelInstance* instance);
  Status DeferPayloadSchedule(
      const StandardScheduleFunc& OnSchedule, const TritonModel* model,
      TritonModelInstance* instance = nullptr);
  void OnStage(ModelInstanceContext* instance_ptr);
  void OnRelease(ModelInstanceContext* instance_ptr);
  void AttemptAllocation();
  void SchedulePayload(
      TritonModelInstance* tmi, PayloadQueue* payload_queue,
      const std::shared_ptr<Payload>& payload);
  bool ignore_resources_and_priority_;

  // Instance context for the models
  std::map<
      const TritonModel*, std::vector<std::shared_ptr<ModelInstanceContext>>>
      model_instance_ctxs_;
  std::mutex model_instance_ctx_mtx_;
  // Running context of the models
  std::map<const TritonModel*, ModelContext> model_contexts_;
  std::mutex model_ctx_mtx_;

  // Holds the model instances that have been staged
  PriorityQueue staged_instances_;
  std::recursive_mutex staged_instances_mtx_;

  // Manager to keep track of the resource allocations
  std::unique_ptr<ResourceManager> resource_manager_;

  // Mutex to serialize Payload [de]allocation
  std::mutex payload_mu_;
  // Mutex to serialize Payload Queues deallocation
  std::mutex payload_queues_mu_;
  // Keep some number of Payload objects for reuse to avoid the overhead
  // of creating a Payload for every new request.
  const size_t max_payload_bucket_count_;
  std::vector<std::shared_ptr<Payload>> payload_bucket_;
  std::deque<std::shared_ptr<Payload>> payloads_in_use_;
  struct PayloadQueue {
    explicit PayloadQueue(size_t max_batch_size, uint64_t max_queue_delay_ns)
    {
      queue_.reset(new InstanceQueue(max_batch_size, max_queue_delay_ns));
    }
    std::unique_ptr<InstanceQueue> queue_;
    std::map<const TritonModelInstance*, std::unique_ptr<InstanceQueue>>
        specific_queues_;
    std::mutex mu_;
    std::condition_variable cv_;
  };
  std::map<const TritonModel*, std::unique_ptr<PayloadQueue>> payload_queues_;
};
}}  // namespace triton::core
3rdparty/core-r22.12/src/repo_agent.cc
// Copyright 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "repo_agent.h"
#include <string>
#include "filesystem.h"
#include "shared_library.h"
#include "triton/common/logging.h"
#include "tritonserver_apis.h"
// For unknown reasons, Windows will not export the TRITONREPOAGENT_*
// functions declared with dllexport in tritonrepoagent.h. To get
// those functions exported it is (also?) necessary to mark the
// definitions in this file with dllexport as well.
#if defined(_MSC_VER)
#define TRITONAPI_DECLSPEC __declspec(dllexport)
#elif defined(__GNUC__)
#define TRITONAPI_DECLSPEC __attribute__((__visibility__("default")))
#else
#define TRITONAPI_DECLSPEC
#endif
namespace triton { namespace core {
std::string
TritonRepoAgentLibraryName(const std::string& agent_name)
{
#ifdef _WIN32
  return std::string("tritonrepoagent_") + agent_name + ".dll";
#else
  return std::string("libtritonrepoagent_") + agent_name + ".so";
#endif
}
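// For example, a hypothetical agent named "checksum" resolves to
// "libtritonrepoagent_checksum.so" on Linux and
// "tritonrepoagent_checksum.dll" on Windows.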
std::string
TRITONREPOAGENT_ActionTypeString(const TRITONREPOAGENT_ActionType type)
{
  switch (type) {
    case TRITONREPOAGENT_ACTION_LOAD:
      return "TRITONREPOAGENT_ACTION_LOAD";
    case TRITONREPOAGENT_ACTION_LOAD_COMPLETE:
      return "TRITONREPOAGENT_ACTION_LOAD_COMPLETE";
    case TRITONREPOAGENT_ACTION_LOAD_FAIL:
      return "TRITONREPOAGENT_ACTION_LOAD_FAIL";
    case TRITONREPOAGENT_ACTION_UNLOAD:
      return "TRITONREPOAGENT_ACTION_UNLOAD";
    case TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE:
      return "TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE";
  }
  return "Unknown TRITONREPOAGENT_ActionType";
}
std::string
TRITONREPOAGENT_ArtifactTypeString(const TRITONREPOAGENT_ArtifactType type)
{
  switch (type) {
    case TRITONREPOAGENT_ARTIFACT_FILESYSTEM:
      return "TRITONREPOAGENT_ARTIFACT_FILESYSTEM";
    case TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM:
      return "TRITONREPOAGENT_ARTIFACT_REMOTE_FILESYSTEM";
  }
  return "Unknown TRITONREPOAGENT_ArtifactType";
}
//
// TritonRepoAgent
//
Status
TritonRepoAgent::Create(
    const std::string& name, const std::string& libpath,
    std::shared_ptr<TritonRepoAgent>* agent)
{
  std::shared_ptr<TritonRepoAgent> lagent(new TritonRepoAgent(name));
  {
    std::unique_ptr<SharedLibrary> slib;
    RETURN_IF_ERROR(SharedLibrary::Acquire(&slib));
    RETURN_IF_ERROR(slib->OpenLibraryHandle(libpath, &lagent->dlhandle_));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_Initialize", true /* optional */,
        reinterpret_cast<void**>(&lagent->init_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_Finalize", true /* optional */,
        reinterpret_cast<void**>(&lagent->fini_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_ModelInitialize",
        true /* optional */,
        reinterpret_cast<void**>(&lagent->model_init_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_ModelFinalize",
        true /* optional */,
        reinterpret_cast<void**>(&lagent->model_fini_fn_)));
    RETURN_IF_ERROR(slib->GetEntrypoint(
        lagent->dlhandle_, "TRITONREPOAGENT_ModelAction",
        false /* optional */,
        reinterpret_cast<void**>(&lagent->model_action_fn_)));
  }
  // Initialize if needed
  if (lagent->init_fn_ != nullptr) {
    RETURN_IF_TRITONSERVER_ERROR(lagent->init_fn_(
        reinterpret_cast<TRITONREPOAGENT_Agent*>(lagent.get())));
  }
  *agent = std::move(lagent);
  return Status::Success;
}
TritonRepoAgent::~TritonRepoAgent()
{
  // Finalize if needed
  if (fini_fn_ != nullptr) {
    auto err = fini_fn_(reinterpret_cast<TRITONREPOAGENT_Agent*>(this));
    if (err != nullptr) {
      LOG_ERROR << "~TritonRepoAgent: "
                << Status(
                       TritonCodeToStatusCode(TRITONSERVER_ErrorCode(err)),
                       TRITONSERVER_ErrorMessage(err))
                       .AsString();
      TRITONSERVER_ErrorDelete(err);
    };
  }
  {
    std::unique_ptr<SharedLibrary> slib;
    LOG_STATUS_ERROR(SharedLibrary::Acquire(&slib), "~TritonRepoAgent");
    LOG_STATUS_ERROR(slib->CloseLibraryHandle(dlhandle_), "~TritonRepoAgent");
  }
}
//
// TritonRepoAgentModel
//
Status
TritonRepoAgentModel::Create(
    const TRITONREPOAGENT_ArtifactType type, const std::string& location,
    const inference::ModelConfig& config,
    const std::shared_ptr<TritonRepoAgent>& agent,
    const TritonRepoAgent::Parameters& agent_parameters,
    std::unique_ptr<TritonRepoAgentModel>* agent_model)
{
  std::unique_ptr<TritonRepoAgentModel> lagent_model(new TritonRepoAgentModel(
      type, location, config, agent, agent_parameters));
  if (agent->AgentModelInitFn() != nullptr) {
    RETURN_IF_TRITONSERVER_ERROR(agent->AgentModelInitFn()(
        reinterpret_cast<TRITONREPOAGENT_Agent*>(agent.get()),
        reinterpret_cast<TRITONREPOAGENT_AgentModel*>(lagent_model.get())));
  }
  *agent_model = std::move(lagent_model);
  return Status::Success;
}
TritonRepoAgentModel::~TritonRepoAgentModel()
{
  // Need to ensure the proper lifecycle is informed
  if (action_type_set_) {
    switch (current_action_type_) {
      case TRITONREPOAGENT_ACTION_LOAD:
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_LOAD_FAIL),
            "Inform TRITONREPOAGENT_ACTION_LOAD_FAIL");
        break;
      case TRITONREPOAGENT_ACTION_LOAD_COMPLETE:
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_UNLOAD),
            "Inform TRITONREPOAGENT_ACTION_UNLOAD");
        // The [[fallthrough]] attribute is not a language feature until
        // C++17, so invoke the UNLOAD_COMPLETE action explicitly here
        // instead of falling through.
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE),
            "Inform TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE");
        break;
      case TRITONREPOAGENT_ACTION_UNLOAD:
        LOG_TRITONSERVER_ERROR(
            agent_->AgentModelActionFn()(
                reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
                reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this),
                TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE),
            "Inform TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE");
        break;
      case TRITONREPOAGENT_ACTION_LOAD_FAIL:
      case TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE:
        break;
    }
  }
  if (agent_->AgentModelFiniFn() != nullptr) {
    LOG_TRITONSERVER_ERROR(
        agent_->AgentModelFiniFn()(
            reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
            reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this)),
        "~TritonRepoAgentModel");
  }
  if (!acquired_location_.empty()) {
    DeleteMutableLocation();
  }
}
Status
TritonRepoAgentModel::InvokeAgent(const TRITONREPOAGENT_ActionType action_type)
{
  if ((!action_type_set_) && (action_type != TRITONREPOAGENT_ACTION_LOAD)) {
    return Status(
        Status::Code::INTERNAL,
        "Unexpected lifecycle start state " +
            TRITONREPOAGENT_ActionTypeString(action_type));
  }
  switch (action_type) {
    case TRITONREPOAGENT_ACTION_LOAD:
      if (action_type_set_) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
    case TRITONREPOAGENT_ACTION_LOAD_COMPLETE:
    case TRITONREPOAGENT_ACTION_LOAD_FAIL:
      if (current_action_type_ != TRITONREPOAGENT_ACTION_LOAD) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
    case TRITONREPOAGENT_ACTION_UNLOAD:
      if (current_action_type_ != TRITONREPOAGENT_ACTION_LOAD_COMPLETE) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
    case TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE:
      if (current_action_type_ != TRITONREPOAGENT_ACTION_UNLOAD) {
        return Status(
            Status::Code::INTERNAL,
            "Unexpected lifecycle state transition from " +
                TRITONREPOAGENT_ActionTypeString(current_action_type_) +
                " to " + TRITONREPOAGENT_ActionTypeString(action_type));
      }
      break;
  }
  current_action_type_ = action_type;
  action_type_set_ = true;
  RETURN_IF_TRITONSERVER_ERROR(agent_->AgentModelActionFn()(
      reinterpret_cast<TRITONREPOAGENT_Agent*>(agent_.get()),
      reinterpret_cast<TRITONREPOAGENT_AgentModel*>(this), action_type));
  return Status::Success;
}
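// Note: InvokeAgent() enforces the legal action-type transitions:
//
//   LOAD -> LOAD_COMPLETE -> UNLOAD -> UNLOAD_COMPLETE
//   LOAD -> LOAD_FAIL
//
// Any other transition is reported as an INTERNAL error.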
Status
TritonRepoAgentModel::SetLocation(
    const TRITONREPOAGENT_ArtifactType type, const std::string& location)
{
  if (current_action_type_ != TRITONREPOAGENT_ACTION_LOAD) {
    return Status(
        Status::Code::INVALID_ARG,
        "location can only be updated during TRITONREPOAGENT_ACTION_LOAD, "
        "current action type is " +
            (action_type_set_
                 ? TRITONREPOAGENT_ActionTypeString(current_action_type_)
                 : "not set"));
  }
  type_ = type;
  location_ = location;
  return Status::Success;
}
Status
TritonRepoAgentModel::Location(
    TRITONREPOAGENT_ArtifactType* type, const char** location)
{
  if (location_.empty()) {
    return Status(
        Status::Code::INTERNAL, "Model repository location is not set");
  }
  *type = type_;
  *location = location_.c_str();
  return Status::Success;
}
Status
TritonRepoAgentModel::AcquireMutableLocation(
    const TRITONREPOAGENT_ArtifactType type, const char** location)
{
  if (type != TRITONREPOAGENT_ARTIFACT_FILESYSTEM) {
    return Status(
        Status::Code::INVALID_ARG,
        "Unexpected artifact type, expects "
        "'TRITONREPOAGENT_ARTIFACT_FILESYSTEM'");
  }
  if (acquired_location_.empty()) {
    std::string lacquired_location;
    RETURN_IF_ERROR(
        MakeTemporaryDirectory(FileSystemType::LOCAL, &lacquired_location));
    acquired_location_.swap(lacquired_location);
    acquired_type_ = type;
  }
  *location = acquired_location_.c_str();
  return Status::Success;
}
Status
TritonRepoAgentModel::DeleteMutableLocation()
{
  if (acquired_location_.empty()) {
    return Status(
        Status::Code::UNAVAILABLE, "No mutable location to be deleted");
  }
  auto status = DeletePath(acquired_location_);
  if (!status.IsOk()) {
    LOG_ERROR << "Failed to delete previously acquired location '"
              << acquired_location_ << "': " << status.AsString();
  }
  acquired_location_.clear();
  return Status::Success;
}
//
// TritonRepoAgentManager
//
TritonRepoAgentManager&
TritonRepoAgentManager::Singleton()
{
  static TritonRepoAgentManager triton_repo_agent_manager;
  return triton_repo_agent_manager;
}
Status
TritonRepoAgentManager::SetGlobalSearchPath(const std::string& path)
{
  auto& singleton_manager = Singleton();
  std::lock_guard<std::mutex> lock(singleton_manager.mu_);
  singleton_manager.global_search_path_ = path;
  return Status::Success;
}
Status
TritonRepoAgentManager::CreateAgent(
    const std::string& agent_name, std::shared_ptr<TritonRepoAgent>* agent)
{
  auto& singleton_manager = Singleton();
  std::lock_guard<std::mutex> lock(singleton_manager.mu_);

  // Get the path to the agent shared library. Search path is global
  // agent directory. FIXME expose global path as Triton option
  const std::vector<std::string> search_paths = {
      JoinPath({singleton_manager.global_search_path_, agent_name})};
  std::string agent_libname = TritonRepoAgentLibraryName(agent_name);
  std::string libpath;
  for (const auto& path : search_paths) {
    const auto full_path = JoinPath({path, agent_libname});
    bool exists = false;
    RETURN_IF_ERROR(FileExists(full_path, &exists));
    if (exists) {
      libpath = full_path;
      break;
    }
  }
  if (libpath.empty()) {
    return Status(
        Status::Code::INVALID_ARG,
        "unable to find '" + agent_libname + "' for repo agent '" +
            agent_name +
            "', searched: " + singleton_manager.global_search_path_);
  }

  const auto& itr = singleton_manager.agent_map_.find(libpath);
  if (itr != singleton_manager.agent_map_.end()) {
    // Found in map. If the weak_ptr is still valid that means that
    // there are other models using the agent and we just reuse that
    // same agent. If the weak_ptr is not valid then agent has been
    // unloaded so we need to remove the weak_ptr from the map and
    // create the agent again.
    *agent = itr->second.lock();
    if (*agent != nullptr) {
      return Status::Success;
    }
    singleton_manager.agent_map_.erase(itr);
  }
  RETURN_IF_ERROR(TritonRepoAgent::Create(agent_name, libpath, agent));
  singleton_manager.agent_map_.insert({libpath, *agent});
  return Status::Success;
}
Status
TritonRepoAgentManager::AgentState(
    std::unique_ptr<std::unordered_map<std::string, std::string>>* agent_state)
{
  auto& singleton_manager = Singleton();
  std::lock_guard<std::mutex> lock(singleton_manager.mu_);

  std::unique_ptr<std::unordered_map<std::string, std::string>>
      agent_state_map(new std::unordered_map<std::string, std::string>);
  for (const auto& agent_pair : singleton_manager.agent_map_) {
    auto& libpath = agent_pair.first;
    auto agent = agent_pair.second.lock();
    if (agent != nullptr) {
      agent_state_map->insert({agent->Name(), libpath});
    }
  }
  *agent_state = std::move(agent_state_map);
  return Status::Success;
}
extern "C" {

TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ApiVersion(uint32_t* major, uint32_t* minor)
{
  *major = TRITONREPOAGENT_API_VERSION_MAJOR;
  *minor = TRITONREPOAGENT_API_VERSION_MINOR;
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocation(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    TRITONREPOAGENT_ArtifactType* artifact_type, const char** location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(tam->Location(artifact_type, location));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationAcquire(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char** location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(
      tam->AcquireMutableLocation(artifact_type, location));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryLocationRelease(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const char* location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(tam->DeleteMutableLocation());
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelRepositoryUpdate(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const TRITONREPOAGENT_ArtifactType artifact_type, const char* location)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  RETURN_TRITONSERVER_ERROR_IF_ERROR(
      tam->SetLocation(artifact_type, location));
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameterCount(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    uint32_t* count)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  *count = tam->AgentParameters().size();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelParameter(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t index, const char** parameter_name,
    const char** parameter_value)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  const auto& params = tam->AgentParameters();
  if (index >= params.size()) {
    return TRITONSERVER_ErrorNew(
        TRITONSERVER_ERROR_INVALID_ARG,
        "index out of range for model parameters");
  }
  *parameter_name = params[index].first.c_str();
  *parameter_value = params[index].second.c_str();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelConfig(
    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
    const uint32_t config_version, TRITONSERVER_Message** model_config)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  std::string model_config_json;
  RETURN_TRITONSERVER_ERROR_IF_ERROR(
      ModelConfigToJson(tam->Config(), config_version, &model_config_json));
  return TRITONSERVER_MessageNewFromSerializedJson(
      model_config, model_config_json.c_str(), model_config_json.length());
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelState(TRITONREPOAGENT_AgentModel* model, void** state)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  *state = tam->State();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_ModelSetState(TRITONREPOAGENT_AgentModel* model, void* state)
{
  TritonRepoAgentModel* tam = reinterpret_cast<TritonRepoAgentModel*>(model);
  tam->SetState(state);
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_State(TRITONREPOAGENT_Agent* agent, void** state)
{
  TritonRepoAgent* ta = reinterpret_cast<TritonRepoAgent*>(agent);
  *state = ta->State();
  return nullptr;  // success
}
TRITONAPI_DECLSPEC TRITONSERVER_Error*
TRITONREPOAGENT_SetState(TRITONREPOAGENT_Agent* agent, void* state)
{
  TritonRepoAgent* ta = reinterpret_cast<TritonRepoAgent*>(agent);
  ta->SetState(state);
  return nullptr;  // success
}
}  // extern "C"

}}  // namespace triton::core
3rdparty/core-r22.12/src/repo_agent.h
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "tritonserver_apis.h"
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "constants.h"
#include "model_config_utils.h"
namespace triton { namespace core {
std::string TritonRepoAgentLibraryName(const std::string& agent_name);
std::string TRITONREPOAGENT_ActionTypeString(
    const TRITONREPOAGENT_ActionType type);
std::string TRITONREPOAGENT_ArtifactTypeString(
    const TRITONREPOAGENT_ArtifactType type);
class
TritonRepoAgent
{
public:
using
Parameters
=
std
::
vector
<
std
::
pair
<
std
::
string
,
std
::
string
>>
;
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentInitFn_t
)(
TRITONREPOAGENT_Agent
*
agent
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentFiniFn_t
)(
TRITONREPOAGENT_Agent
*
agent
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentModelInitFn_t
)(
TRITONREPOAGENT_Agent
*
agent
,
TRITONREPOAGENT_AgentModel
*
model
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentModelFiniFn_t
)(
TRITONREPOAGENT_Agent
*
agent
,
TRITONREPOAGENT_AgentModel
*
model
);
typedef
TRITONSERVER_Error
*
(
*
TritonRepoAgentModelActionFn_t
)(
TRITONREPOAGENT_Agent
*
agent
,
TRITONREPOAGENT_AgentModel
*
model
,
const
TRITONREPOAGENT_ActionType
action_type
);
static
Status
Create
(
const
std
::
string
&
name
,
const
std
::
string
&
libpath
,
std
::
shared_ptr
<
TritonRepoAgent
>*
agent
);
~
TritonRepoAgent
();
const
std
::
string
&
Name
()
{
return
name_
;
}
void
*
State
()
{
return
state_
;
}
void
SetState
(
void
*
state
)
{
state_
=
state
;
}
TritonRepoAgentModelActionFn_t
AgentModelActionFn
()
const
{
return
model_action_fn_
;
}
TritonRepoAgentModelInitFn_t
AgentModelInitFn
()
const
{
return
model_init_fn_
;
}
TritonRepoAgentModelFiniFn_t
AgentModelFiniFn
()
const
{
return
model_fini_fn_
;
}
protected:
DISALLOW_COPY_AND_ASSIGN
(
TritonRepoAgent
);
TritonRepoAgent
(
const
std
::
string
&
name
)
:
name_
(
name
),
state_
(
nullptr
),
dlhandle_
(
nullptr
),
init_fn_
(
nullptr
),
fini_fn_
(
nullptr
),
model_init_fn_
(
nullptr
),
model_fini_fn_
(
nullptr
),
model_action_fn_
(
nullptr
)
{
}
const
std
::
string
name_
;
void
*
state_
;
// dlopen / dlsym handles
void
*
dlhandle_
;
TritonRepoAgentInitFn_t
init_fn_
;
TritonRepoAgentFiniFn_t
fini_fn_
;
TritonRepoAgentModelInitFn_t
model_init_fn_
;
TritonRepoAgentModelFiniFn_t
model_fini_fn_
;
TritonRepoAgentModelActionFn_t
model_action_fn_
;
};
class
TritonRepoAgentModel
{
public:
static
Status
Create
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
std
::
string
&
location
,
const
inference
::
ModelConfig
&
config
,
const
std
::
shared_ptr
<
TritonRepoAgent
>&
agent
,
const
TritonRepoAgent
::
Parameters
&
agent_parameters
,
std
::
unique_ptr
<
TritonRepoAgentModel
>*
agent_model
);
~
TritonRepoAgentModel
();
void
*
State
()
{
return
state_
;
}
void
SetState
(
void
*
state
)
{
state_
=
state
;
}
Status
InvokeAgent
(
const
TRITONREPOAGENT_ActionType
action_type
);
const
TritonRepoAgent
::
Parameters
&
AgentParameters
()
{
return
agent_parameters_
;
}
Status
SetLocation
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
std
::
string
&
location
);
Status
Location
(
TRITONREPOAGENT_ArtifactType
*
type
,
const
char
**
location
);
Status
AcquireMutableLocation
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
char
**
location
);
Status
DeleteMutableLocation
();
const
inference
::
ModelConfig
Config
()
{
return
config_
;
}
private:
DISALLOW_COPY_AND_ASSIGN
(
TritonRepoAgentModel
);
TritonRepoAgentModel
(
const
TRITONREPOAGENT_ArtifactType
type
,
const
std
::
string
&
location
,
const
inference
::
ModelConfig
&
config
,
const
std
::
shared_ptr
<
TritonRepoAgent
>&
agent
,
const
TritonRepoAgent
::
Parameters
&
agent_parameters
)
:
state_
(
nullptr
),
config_
(
config
),
agent_
(
agent
),
agent_parameters_
(
agent_parameters
),
type_
(
type
),
location_
(
location
),
action_type_set_
(
false
),
current_action_type_
(
TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE
)
{
}
void
*
state_
;
const
inference
::
ModelConfig
config_
;
const
std
::
shared_ptr
<
TritonRepoAgent
>
agent_
;
const
TritonRepoAgent
::
Parameters
agent_parameters_
;
TRITONREPOAGENT_ArtifactType
type_
;
std
::
string
location_
;
TRITONREPOAGENT_ArtifactType
acquired_type_
;
std
::
string
acquired_location_
;
bool
action_type_set_
;
TRITONREPOAGENT_ActionType
current_action_type_
;
};
class
TritonRepoAgentManager
{
public:
static
Status
SetGlobalSearchPath
(
const
std
::
string
&
path
);
static
Status
CreateAgent
(
const
std
::
string
&
agent_name
,
std
::
shared_ptr
<
TritonRepoAgent
>*
agent
);
static
Status
AgentState
(
std
::
unique_ptr
<
std
::
unordered_map
<
std
::
string
,
std
::
string
>>*
agent_state
);
private:
DISALLOW_COPY_AND_ASSIGN
(
TritonRepoAgentManager
);
TritonRepoAgentManager
()
:
global_search_path_
(
"/opt/tritonserver/repoagents"
){};
static
TritonRepoAgentManager
&
Singleton
();
std
::
mutex
mu_
;
std
::
string
global_search_path_
;
std
::
unordered_map
<
std
::
string
,
std
::
weak_ptr
<
TritonRepoAgent
>>
agent_map_
;
};
}}
// namespace triton::core
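TritonRepoAgent::Create resolves the agent's entry points from a shared library into the dlhandle_ / *_fn_ members above. A self-contained sketch of that resolution pattern using POSIX dlfcn.h (the library path and symbol name below are placeholders, not the loader logic of this file):

#include <dlfcn.h>
#include <cstdio>

// Shape of TritonRepoAgent::TritonRepoAgentModelActionFn_t, reduced to
// opaque pointers for the sketch.
typedef int (*ModelActionFn)(void* agent, void* model, int action_type);

int main()
{
  // "libdemoagent.so" and "DemoAgent_ModelAction" are hypothetical names.
  void* handle = dlopen("libdemoagent.so", RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  auto fn = reinterpret_cast<ModelActionFn>(
      dlsym(handle, "DemoAgent_ModelAction"));
  if (fn == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
  } else {
    fn(nullptr, nullptr, 0);  // invoke the resolved entry point
  }
  dlclose(handle);
  return 0;
}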
3rdparty/core-r22.12/src/response_allocator.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "tritonserver_apis.h"
namespace triton { namespace core {

//
// Implementation for TRITONSERVER_ResponseAllocator.
//
class ResponseAllocator {
 public:
  explicit ResponseAllocator(
      TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
      TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
      TRITONSERVER_ResponseAllocatorStartFn_t start_fn)
      : alloc_fn_(alloc_fn), buffer_attributes_fn_(nullptr),
        query_fn_(nullptr), release_fn_(release_fn), start_fn_(start_fn)
  {
  }

  void SetQueryFunction(TRITONSERVER_ResponseAllocatorQueryFn_t query_fn)
  {
    query_fn_ = query_fn;
  }

  void SetBufferAttributesFunction(
      TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn)
  {
    buffer_attributes_fn_ = buffer_attributes_fn;
  }

  TRITONSERVER_ResponseAllocatorAllocFn_t AllocFn() const { return alloc_fn_; }
  TRITONSERVER_ResponseAllocatorBufferAttributesFn_t BufferAttributesFn() const
  {
    return buffer_attributes_fn_;
  }
  TRITONSERVER_ResponseAllocatorQueryFn_t QueryFn() const { return query_fn_; }
  TRITONSERVER_ResponseAllocatorReleaseFn_t ReleaseFn() const
  {
    return release_fn_;
  }
  TRITONSERVER_ResponseAllocatorStartFn_t StartFn() const { return start_fn_; }

 private:
  TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn_;
  TRITONSERVER_ResponseAllocatorBufferAttributesFn_t buffer_attributes_fn_;
  TRITONSERVER_ResponseAllocatorQueryFn_t query_fn_;
  TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn_;
  TRITONSERVER_ResponseAllocatorStartFn_t start_fn_;
};

}}  // namespace triton::core
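ResponseAllocator is a plain holder for user callbacks: the required ones are fixed at construction and the optional ones are attached later via setters, which is why QueryFn()/BufferAttributesFn() may legitimately return nullptr. A self-contained sketch of the same pattern with simplified function-pointer types (the Alloc/Release/Query signatures below are stand-ins, not the TRITONSERVER_* typedefs):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

typedef void* (*AllocFn)(std::size_t byte_size);  // stand-in signatures
typedef void (*ReleaseFn)(void* buffer);
typedef bool (*QueryFn)(std::size_t byte_size);

class CallbackHolder {
 public:
  CallbackHolder(AllocFn alloc_fn, ReleaseFn release_fn)
      : alloc_fn_(alloc_fn), release_fn_(release_fn), query_fn_(nullptr)
  {
  }
  void SetQueryFunction(QueryFn query_fn) { query_fn_ = query_fn; }
  AllocFn Alloc() const { return alloc_fn_; }
  ReleaseFn Release() const { return release_fn_; }
  QueryFn Query() const { return query_fn_; }  // may be nullptr

 private:
  AllocFn alloc_fn_;
  ReleaseFn release_fn_;
  QueryFn query_fn_;
};

int main()
{
  CallbackHolder holder(
      [](std::size_t n) { return std::malloc(n); },
      [](void* p) { std::free(p); });
  void* buf = holder.Alloc()(64);
  // Optional callbacks must be nullptr-checked before use.
  if (holder.Query() != nullptr && holder.Query()(64)) {
    std::puts("query says ok");
  }
  holder.Release()(buf);
  return 0;
}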
3rdparty/core-r22.12/src/response_cache.cc
0 → 100644
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "response_cache.h"
#include "infer_stats.h"
#include "triton/common/logging.h"
namespace {

enum class ScopedTimerType { INSERTION, LOOKUP };

class ScopedTimer {
 public:
  explicit ScopedTimer(
      triton::core::InferenceRequest& request, uint64_t& duration,
      ScopedTimerType type)
      : request_(request), duration_(duration), type_(type)
  {
    switch (type_) {
      case ScopedTimerType::LOOKUP:
        request_.CaptureCacheLookupStartNs();
        break;
      case ScopedTimerType::INSERTION:
        request_.CaptureCacheInsertionStartNs();
        break;
    }
  }

  ~ScopedTimer()
  {
    switch (type_) {
      case ScopedTimerType::LOOKUP:
        request_.CaptureCacheLookupEndNs();
        duration_ +=
            request_.CacheLookupEndNs() - request_.CacheLookupStartNs();
        break;
      case ScopedTimerType::INSERTION:
        request_.CaptureCacheInsertionEndNs();
        duration_ +=
            request_.CacheInsertionEndNs() - request_.CacheInsertionStartNs();
        break;
    }
  }

 private:
  triton::core::InferenceRequest& request_;
  uint64_t& duration_;
  ScopedTimerType type_;
};

std::string
PointerToString(void* ptr)
{
  std::stringstream ss;
  ss << ptr;
  return ss.str();
}

}  // namespace
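ScopedTimer uses RAII so the end timestamp is captured on every exit path of Lookup()/Insert(), including early error returns. A self-contained sketch of the same idea with std::chrono (DemoTimer is a hypothetical name):

#include <chrono>
#include <cstdint>
#include <cstdio>

class DemoTimer {
 public:
  explicit DemoTimer(uint64_t& duration_ns)
      : duration_ns_(duration_ns), start_(std::chrono::steady_clock::now())
  {
  }
  ~DemoTimer()
  {
    // Accumulates on destruction, so every return path is measured.
    duration_ns_ += std::chrono::duration_cast<std::chrono::nanoseconds>(
                        std::chrono::steady_clock::now() - start_)
                        .count();
  }

 private:
  uint64_t& duration_ns_;
  std::chrono::steady_clock::time_point start_;
};

int main()
{
  uint64_t total_ns = 0;
  {
    DemoTimer timer(total_ns);  // scope mirrors a Lookup()/Insert() body
  }
  std::printf("elapsed ~%llu ns\n", (unsigned long long)total_ns);
  return 0;
}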
namespace triton { namespace core {

Status
RequestResponseCache::Create(
    uint64_t cache_size, std::unique_ptr<RequestResponseCache>* cache)
{
  try {
    cache->reset(new RequestResponseCache(cache_size));
  }
  catch (const std::exception& ex) {
    return Status(
        Status::Code::INTERNAL,
        "Failed to initialize Response Cache: " + std::string(ex.what()));
  }

  return Status::Success;
}

RequestResponseCache::RequestResponseCache(const uint64_t size)
{
  // Allocate buffer
  buffer_ = malloc(size);
  // Exit early if buffer allocation failed
  if (buffer_ == nullptr) {
    throw std::runtime_error("failed to allocate buffer");
  }

  // Create cache as managed buffer
  managed_buffer_ = boost::interprocess::managed_external_buffer(
      boost::interprocess::create_only_t{}, buffer_, size);

  LOG_INFO << "Response Cache is created at '" << PointerToString(buffer_)
           << "' with size " << size;
}
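The constructor above layers a boost::interprocess::managed_external_buffer over a single malloc'd region, so every cache entry is carved out of one preallocated arena. A self-contained sketch of that allocation scheme, using only calls that also appear in this file (the sizes are arbitrary):

#include <boost/interprocess/managed_external_buffer.hpp>
#include <cstdio>
#include <cstdlib>
#include <new>

int main()
{
  const std::size_t arena_size = 1 << 16;  // 64 KiB arena, arbitrary
  void* arena = std::malloc(arena_size);
  if (arena == nullptr) return 1;

  boost::interprocess::managed_external_buffer pool(
      boost::interprocess::create_only_t{}, arena, arena_size);

  // The nothrow overload returns nullptr on exhaustion instead of throwing,
  // which is what lets the cache evict-and-retry in BuildCacheEntry().
  void* chunk = pool.allocate(1024, std::nothrow_t{});
  std::printf("free after allocate: %zu bytes\n", pool.get_free_memory());

  pool.deallocate(chunk);
  std::printf(
      "all deallocated: %s\n", pool.all_memory_deallocated() ? "yes" : "no");

  std::free(arena);
  return 0;
}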
RequestResponseCache::~RequestResponseCache()
{
  // Deallocate each chunk from managed buffer
  for (auto& iter : cache_) {
    auto& entry = iter.second;
    for (auto& output : entry.outputs_) {
      if (output.buffer_ != nullptr) {
        managed_buffer_.deallocate(output.buffer_);
      }
    }
  }

  // Validate we freed all underlying memory managed by cache
  if (!managed_buffer_.all_memory_deallocated()) {
    // Destructors can't throw exceptions
    LOG_ERROR << "failed to free managed cache memory";
  }

  // Free total cache buffer
  if (buffer_ != nullptr) {
    free(buffer_);
  }
}

Status
RequestResponseCache::Lookup(
    InferenceResponse* const response, InferenceRequest* const request)
{
  // Lock on cache lookup
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  if (request == nullptr) {
    return Status(
        Status::Code::INTERNAL, "Cache Lookup passed a nullptr request");
  }

  // Capture start latency now and end latency when timer goes out of scope
  ScopedTimer timer(
      *request, total_lookup_latency_ns_, ScopedTimerType::LOOKUP);

  // Hash the request and set cache key if it hasn't already been set
  if (!request->CacheKeyIsSet()) {
    RETURN_IF_ERROR(HashAndSet(request));
  }
  const uint64_t key = request->CacheKey();

  num_lookups_++;
  LOG_VERBOSE(1) << request->LogRequest()
                 << "Looking up key [" + std::to_string(key) + "] in cache.";

  // Search cache for request hash key
  auto iter = cache_.find(key);
  if (iter == cache_.end()) {
    num_misses_++;
    LOG_VERBOSE(1) << request->LogRequest()
                   << "MISS for key [" + std::to_string(key) + "] in cache.";
    return Status(
        Status::Code::INTERNAL,
        request->LogRequest() + "key not found in cache");
  }
  // If find succeeds, it's a cache hit
  num_hits_++;
  LOG_VERBOSE(1) << request->LogRequest()
                 << "HIT for key [" + std::to_string(key) + "] in cache.";

  // Populate passed-in "response" from cache entry
  auto entry = iter->second;
  // Build InferenceResponse from CacheEntry
  RETURN_IF_ERROR(BuildInferenceResponse(entry, response));

  // Update this key to front of LRU list
  UpdateLRU(iter);
  LOG_VERBOSE(1) << request->LogRequest()
                 << "Using cached response for key [" + std::to_string(key) +
                        "].";
  return Status::Success;
}

Status
RequestResponseCache::Insert(
    const InferenceResponse& response, InferenceRequest* const request)
{
  // Lock on cache insertion
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  if (request == nullptr) {
    return Status(
        Status::Code::INTERNAL, "Cache Insert passed a nullptr request");
  }

  // Capture start latency now and end latency when timer goes out of scope
  ScopedTimer timer(
      *request, total_insertion_latency_ns_, ScopedTimerType::INSERTION);

  // Hash the request and set cache key if it hasn't already been set
  if (!request->CacheKeyIsSet()) {
    RETURN_IF_ERROR(HashAndSet(request));
  }
  const uint64_t key = request->CacheKey();

  // Exit early if key already exists in cache
  auto iter = cache_.find(key);
  if (iter != cache_.end()) {
    return Status(
        Status::Code::ALREADY_EXISTS, request->LogRequest() + "key [" +
                                          std::to_string(key) +
                                          "] already exists in cache");
  }

  // Construct cache entry from response
  auto entry = CacheEntry();
  RETURN_IF_ERROR(BuildCacheEntry(response, &entry));

  // Insert entry into cache
  LOG_VERBOSE(1) << request->LogRequest()
                 << "Inserting key [" + std::to_string(key) + "] into cache.";
  auto cache_pair = cache_.insert({key, entry});
  // Exit early if cache insertion failed
  if (!cache_pair.second) {
    LOG_ERROR << request->LogRequest() << "Failed to insert key into map.";
    return Status(
        Status::Code::INTERNAL,
        request->LogRequest() + "Cache insertion failed");
  }

  // Update LRU with new cache entry
  auto cache_iter = cache_pair.first;
  UpdateLRU(cache_iter);
  return Status::Success;
}

// LRU
Status
RequestResponseCache::Evict()
{
  // Lock on cache eviction
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  // Nothing to evict if cache is empty
  if (NumEntries() == 0) {
    return Status(
        Status::Code::INTERNAL, "Cache is empty, nothing to evict.");
  }

  // Least recently used key in back of LRU list
  uint64_t lru_key = lru_.back();
  LOG_VERBOSE(1) << "Evicting key [" + std::to_string(lru_key) +
                        "] from cache.";

  // Find cache entry for least recently used key
  auto iter = cache_.find(lru_key);
  // Error check if key isn't in cache, but this shouldn't happen in evict
  // and probably indicates a bug
  if (iter == cache_.end()) {
    return Status(
        Status::Code::INTERNAL,
        "key [" + std::to_string(lru_key) +
            "] not found in cache during eviction: this indicates a bug in "
            "the code");
  }

  // Get size of cache entry being evicted to update available size
  auto entry = iter->second;
  // Free managed memory used in cache entry's outputs
  for (auto& output : entry.outputs_) {
    // Lock on buffer deallocation
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    managed_buffer_.deallocate(output.buffer_);
  }

  // Remove LRU entry from cache
  cache_.erase(lru_key);
  // Remove LRU key from LRU list
  lru_.pop_back();
  // Increment number of evictions
  num_evictions_++;
  return Status::Success;
}

// Helpers
void
RequestResponseCache::UpdateLRU(
    std::unordered_map<uint64_t, CacheEntry>::iterator& cache_iter)
{
  // Lock on cache update
  std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

  const auto& key = cache_iter->first;
  auto& cache_entry = cache_iter->second;
  // Remove key from LRU list if it was already in there
  auto lru_iter = std::find(lru_.begin(), lru_.end(), key);
  if (lru_iter != lru_.end()) {
    lru_.erase(lru_iter);
  }
  // Add key to front of LRU list since it's most recently used
  lru_.push_front(key);
  // Set CacheEntry LRU iterator to new LRU key location
  cache_entry.lru_iter_ = lru_.begin();
}
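UpdateLRU keeps an std::list of keys ordered most- to least-recently used, with each map entry remembering its list iterator. A self-contained sketch of that map-plus-list bookkeeping (Touch is a hypothetical helper name):

#include <cstdint>
#include <cstdio>
#include <list>
#include <unordered_map>

struct Entry {
  std::list<uint64_t>::iterator lru_iter;  // position in the LRU list
};

std::unordered_map<uint64_t, Entry> cache;
std::list<uint64_t> lru;  // front = most recent, back = least recent

void Touch(uint64_t key)
{
  auto it = cache.find(key);
  if (it == cache.end()) {
    return;
  }
  // Erasing via the stored iterator is O(1); UpdateLRU() above uses a linear
  // std::find instead, which is equivalent in effect but O(n).
  lru.erase(it->second.lru_iter);
  lru.push_front(key);
  it->second.lru_iter = lru.begin();
}

int main()
{
  for (uint64_t k : {1, 2, 3}) {
    lru.push_front(k);
    cache[k] = Entry{lru.begin()};
  }
  Touch(1);  // key 1 becomes most recent
  // The back of the list is the eviction candidate, as in Evict() above.
  std::printf("evict candidate: %llu\n", (unsigned long long)lru.back());
  return 0;
}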
Status
RequestResponseCache::BuildCacheEntry(
    const InferenceResponse& response, CacheEntry* const entry)
{
  // Build cache entry data from response outputs
  for (const auto& response_output : response.Outputs()) {
    auto cache_output = Output();

    // Fetch output buffer details
    const void* response_buffer = nullptr;
    size_t response_byte_size = 0;
    TRITONSERVER_MemoryType response_memory_type;
    int64_t response_memory_type_id;
    void* userp;
    RETURN_IF_ERROR(response_output.DataBuffer(
        &response_buffer, &response_byte_size, &response_memory_type,
        &response_memory_type_id, &userp));

    // TODO: Handle other memory types
    if (response_memory_type != TRITONSERVER_MEMORY_CPU &&
        response_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) {
      return Status(
          Status::Code::INTERNAL,
          "Only input buffers in CPU memory are allowed in cache currently");
    }

    // Exit early if response buffer from output is invalid
    if (response_buffer == nullptr) {
      return Status(
          Status::Code::INTERNAL, "Response buffer from output was nullptr");
    }

    // Lock on managed buffer references
    {
      std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);

      // Exit early if cache entry will be larger than available cache size
      if (response_byte_size > managed_buffer_.get_size()) {
        return Status(
            Status::Code::INTERNAL,
            "Cache entry is larger than total cache size");
      }

      // If cache doesn't have enough space, evict until enough space
      // available.
      // NOTE: FreeBytes() doesn't account for allocator overhead so
      // allocation may fail even if response_byte_size is less than
      // FreeBytes()
      while (response_byte_size > FreeBytes()) {
        LOG_VERBOSE(1) << "EVICT: Response larger than remaining available "
                          "memory, attempting to evict from cache.";
        RETURN_IF_ERROR(Evict());
      }

      // Attempt to allocate buffer until success or eviction from cache fails
      while (cache_output.buffer_ == nullptr) {
        // Allocate buffer for response output in cache entry
        cache_output.buffer_ =
            managed_buffer_.allocate(response_byte_size, std::nothrow_t{});
        // Attempt to evict if allocation fails
        if (cache_output.buffer_ == nullptr) {
          LOG_VERBOSE(1) << "FAILED to allocate buffer in cache. Attempting "
                            "to evict an entry.";
          // Exit out if Eviction fails
          RETURN_IF_ERROR(Evict());
        }
      }

      // Copy data from response buffer to cache entry output buffer
      // TODO: Handle other memory types
      std::memcpy(cache_output.buffer_, response_buffer, response_byte_size);

      // Set output metadata
      cache_output.name_ = response_output.Name();
      cache_output.dtype_ = response_output.DType();
      cache_output.shape_ = response_output.Shape();
      cache_output.buffer_size_ = static_cast<uint64_t>(response_byte_size);
    }

    // Add each output to cache entry
    entry->outputs_.push_back(cache_output);
  }

  return Status::Success;
}

Status
RequestResponseCache::BuildInferenceResponse(
    const CacheEntry& entry, InferenceResponse* const response)
{
  if (response == nullptr) {
    return Status(Status::Code::INTERNAL, "invalid response ptr passed in");
  }

  // Lock on cache references
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);

    // Inference response outputs should be empty so we can append to them
    if (response->Outputs().size() != 0) {
      return Status(
          Status::Code::INTERNAL,
          "InferenceResponse already contains some outputs");
    }

    for (auto& cache_output : entry.outputs_) {
      InferenceResponse::Output* response_output = nullptr;
      RETURN_IF_ERROR(response->AddOutput(
          cache_output.name_, cache_output.dtype_, cache_output.shape_,
          &response_output));

      if (response_output == nullptr) {
        return Status(
            Status::Code::INTERNAL,
            "InferenceResponse::Output pointer as nullptr");
      }

      TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
      int64_t memory_type_id = 0;

      // Allocate buffer for inference response
      void* buffer;
      RETURN_IF_ERROR(response_output->AllocateDataBuffer(
          &buffer, cache_output.buffer_size_, &memory_type, &memory_type_id));

      // TODO: Handle other memory types
      if (memory_type != TRITONSERVER_MEMORY_CPU &&
          memory_type != TRITONSERVER_MEMORY_CPU_PINNED) {
        return Status(
            Status::Code::INTERNAL,
            "Only input buffers in CPU memory are allowed in cache currently");
      }

      if (buffer == nullptr) {
        return Status(
            Status::Code::INTERNAL,
            "failed to allocate buffer for output '" + cache_output.name_ +
                "'");
      }
      // Copy cached output buffer to allocated response output buffer
      std::memcpy(buffer, cache_output.buffer_, cache_output.buffer_size_);

      // TODO: Add field to InferenceResponse to indicate this was from cache
      // response.cached = true;
    }
  }

  return Status::Success;
}
Status
RequestResponseCache::HashInputBuffers(
    const InferenceRequest::Input* input, size_t* seed)
{
  // Iterate over each data buffer in input in case of non-contiguous memory
  for (size_t idx = 0; idx < input->DataBufferCount(); ++idx) {
    const void* src_buffer;
    size_t src_byte_size;
    TRITONSERVER_MemoryType src_memory_type;
    int64_t src_memory_type_id;

    RETURN_IF_ERROR(input->DataBuffer(
        idx, &src_buffer, &src_byte_size, &src_memory_type,
        &src_memory_type_id));

    // TODO: Handle other memory types
    if (src_memory_type != TRITONSERVER_MEMORY_CPU &&
        src_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) {
      return Status(
          Status::Code::INTERNAL,
          "Only input buffers in CPU memory are allowed in cache currently");
    }

    // Add each byte of input buffer chunk to hash
    const unsigned char* tmp = static_cast<const unsigned char*>(src_buffer);
    for (uint64_t byte = 0; byte < src_byte_size; byte++) {
      boost::hash_combine(*seed, tmp[byte]);
    }
  }

  return Status::Success;
}

Status
RequestResponseCache::HashInputs(const InferenceRequest& request, size_t* seed)
{
  const auto& inputs = request.ImmutableInputs();
  // Convert inputs to ordered map for consistency in hashing
  // inputs sorted by key (input) name
  std::map<std::string, InferenceRequest::Input*> ordered_inputs(
      inputs.begin(), inputs.end());
  for (const auto& input : ordered_inputs) {
    // Add input name to hash
    boost::hash_combine(*seed, input.second->Name());
    // Fetch input buffer for hashing raw data
    RETURN_IF_ERROR(HashInputBuffers(input.second, seed));
  }

  return Status::Success;
}

Status
RequestResponseCache::Hash(const InferenceRequest& request, uint64_t* key)
{
  std::size_t seed = 0;
  // Add request model name to hash
  boost::hash_combine(seed, request.ModelName());
  // Add request model version to hash
  boost::hash_combine(seed, request.ActualModelVersion());
  RETURN_IF_ERROR(HashInputs(request, &seed));
  *key = static_cast<uint64_t>(seed);
  return Status::Success;
}

Status
RequestResponseCache::HashAndSet(InferenceRequest* const request)
{
  uint64_t key = 0;
  RETURN_IF_ERROR(Hash(*request, &key));
  request->SetCacheKey(key);
  return Status::Success;
}

}}  // namespace triton::core
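Hash() composes the cache key with boost::hash_combine over the model name, model version, each input name (in sorted order), and every raw input byte. A self-contained sketch of the same composition for a single hypothetical input (the model name, version, and bytes below are made up):

#include <boost/functional/hash.hpp>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  std::size_t seed = 0;
  boost::hash_combine(seed, std::string("resnet50"));  // model name (made up)
  boost::hash_combine(seed, int64_t{1});               // model version

  // One input tensor: name first, then each raw byte, mirroring the
  // HashInputs() -> HashInputBuffers() chain above.
  boost::hash_combine(seed, std::string("INPUT0"));
  std::vector<unsigned char> raw = {0x01, 0x02, 0x03, 0x04};
  for (unsigned char byte : raw) {
    boost::hash_combine(seed, byte);
  }

  std::printf("cache key = %llu\n", (unsigned long long)(uint64_t)seed);
  return 0;
}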
3rdparty/core-r22.12/src/response_cache.h
0 → 100644
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <list>
#include <string>
#include <unordered_map>
#include "infer_request.h"
#include "infer_response.h"
#include "model.h"
#include "status.h"
#include <boost/functional/hash.hpp>
#include <boost/interprocess/managed_external_buffer.hpp>
namespace triton { namespace core {

// Assuming CPU memory only for now
struct Output {
  // Output tensor data buffer
  void* buffer_;
  // Size of "buffer" above
  uint64_t buffer_size_ = 0;
  // Name of the output
  std::string name_;
  // Datatype of the output
  inference::DataType dtype_;
  // Shape of the output
  std::vector<int64_t> shape_;
};

struct CacheEntry {
  explicit CacheEntry() {}
  // Point to key in LRU list for maintaining LRU order
  std::list<uint64_t>::iterator lru_iter_;
  // each output buffer = managed_buffer.allocate(size, ...)
  std::vector<Output> outputs_;
};

class RequestResponseCache {
 public:
  ~RequestResponseCache();

  // Create the request/response cache object
  static Status Create(
      uint64_t cache_size, std::unique_ptr<RequestResponseCache>* cache);

  // Hash inference request for cache access and store it in "request" object.
  // This will also be called internally in Lookup/Insert if the request
  // hasn't already stored its hash. It is up to the user to update the hash
  // in the request if modifying any hashed fields of the request object
  // after storing.
  // Return Status object indicating success or failure.
  Status HashAndSet(InferenceRequest* const request);

  // Lookup 'request' hash in cache and return the inference response in
  // 'response' on cache hit or nullptr on cache miss
  // Return Status object indicating success or failure.
  Status Lookup(
      InferenceResponse* const response, InferenceRequest* const request);

  // Insert response into cache, evict entries to make space if necessary
  // Return Status object indicating success or failure.
  Status Insert(
      const InferenceResponse& response, InferenceRequest* const request);

  // Evict entry from cache based on policy
  // Return Status object indicating success or failure.
  Status Evict();

  // Returns number of items in cache
  size_t NumEntries()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return cache_.size();
  }
  // Returns number of items evicted in cache lifespan
  size_t NumEvictions()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_evictions_;
  }
  // Returns number of lookups in cache lifespan, should sum to hits + misses
  size_t NumLookups()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_lookups_;
  }
  // Returns number of cache hits in cache lifespan
  size_t NumHits()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_hits_;
  }
  // Returns number of cache misses in cache lifespan
  size_t NumMisses()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_misses_;
  }
  // Returns the total lookup latency (nanoseconds) of all lookups in cache
  // lifespan
  uint64_t TotalLookupLatencyNs()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return total_lookup_latency_ns_;
  }
  uint64_t TotalInsertionLatencyNs()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return total_insertion_latency_ns_;
  }
  // Returns total number of bytes allocated for cache
  size_t TotalBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_size();
  }
  // Returns number of free bytes in cache
  size_t FreeBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_free_memory();
  }
  // Returns number of bytes in use by cache
  size_t AllocatedBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_size() - managed_buffer_.get_free_memory();
  }
  // Returns fraction of bytes allocated over total cache size between [0, 1]
  double TotalUtilization()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return static_cast<double>(AllocatedBytes()) /
           static_cast<double>(TotalBytes());
  }

 private:
  explicit RequestResponseCache(const uint64_t cache_size);
  // Update LRU ordering on lookup
  void UpdateLRU(std::unordered_map<uint64_t, CacheEntry>::iterator&);
  // Build CacheEntry from InferenceResponse
  Status BuildCacheEntry(
      const InferenceResponse& response, CacheEntry* const entry);
  // Build InferenceResponse from CacheEntry
  Status BuildInferenceResponse(
      const CacheEntry& entry, InferenceResponse* const response);
  // Helper function to hash data buffers used by "input"
  Status HashInputBuffers(const InferenceRequest::Input* input, size_t* seed);
  // Helper function to hash each input in "request"
  Status HashInputs(const InferenceRequest& request, size_t* seed);
  // Helper function to hash request and store it in "key"
  Status Hash(const InferenceRequest& request, uint64_t* key);

  // Cache buffer
  void* buffer_;
  // Managed buffer
  boost::interprocess::managed_external_buffer managed_buffer_;
  // key -> CacheEntry containing values and list iterator for LRU management
  std::unordered_map<uint64_t, CacheEntry> cache_;
  // List of keys sorted from most to least recently used
  std::list<uint64_t> lru_;
  // Cache metrics
  size_t num_evictions_ = 0;
  size_t num_lookups_ = 0;
  size_t num_hits_ = 0;
  size_t num_misses_ = 0;
  uint64_t total_lookup_latency_ns_ = 0;
  uint64_t total_insertion_latency_ns_ = 0;
  // Mutex for buffer synchronization
  std::recursive_mutex buffer_mtx_;
  // Mutex for cache synchronization
  std::recursive_mutex cache_mtx_;
};

}}  // namespace triton::core
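Note that the accessors above lock a std::recursive_mutex: TotalUtilization() takes buffer_mtx_ and then calls AllocatedBytes(), which takes it again on the same thread. A self-contained sketch of why a plain std::mutex would self-deadlock here (the Stats class and its numbers are hypothetical):

#include <cstddef>
#include <cstdio>
#include <mutex>

class Stats {
 public:
  std::size_t Allocated()
  {
    std::lock_guard<std::recursive_mutex> lk(mtx_);  // second acquisition
    return used_;
  }
  double Utilization()
  {
    std::lock_guard<std::recursive_mutex> lk(mtx_);  // first acquisition
    // With std::mutex this nested call would deadlock on itself;
    // recursive_mutex lets the same thread re-enter.
    return static_cast<double>(Allocated()) / static_cast<double>(total_);
  }

 private:
  std::recursive_mutex mtx_;
  std::size_t used_ = 256;
  std::size_t total_ = 1024;
};

int main()
{
  Stats stats;
  std::printf("utilization = %.2f\n", stats.Utilization());
  return 0;
}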
3rdparty/core-r22.12/src/scheduler.h
0 → 100644
// Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
#include "infer_request.h"
#include "status.h"
namespace triton { namespace core {

// Scheduler interface.
class Scheduler {
 public:
  virtual ~Scheduler() {}

  // The prototype for the initialization function that will be called
  // by the "standard" schedulers created based on a model's
  // scheduling_choice settings. The init function is called once by
  // the runner that will later execute requests for 'runner_idx'. A
  // non-OK error status indicates an initialization error that
  // prevents scheduler from using the runner.
  using StandardInitFunc = std::function<Status(uint32_t runner_idx)>;

  // The prototype for the warmup function that will be called by the
  // "standard" schedulers created based on a model's
  // scheduling_choice settings. The warmup function is called once by
  // the runner that will later execute requests for 'runner_idx'. A
  // non-OK error status indicates an error that prevents scheduler
  // from sending warmup requests to the runner.
  using StandardWarmupFunc = std::function<Status(uint32_t runner_idx)>;

  // The prototype for the run function that will be called by the
  // "standard" schedulers created based on a model's
  // scheduling_choice settings. The run function must accept a
  // 'runner_idx' indicating which runner should execute the
  // 'requests'. Ownership of the 'requests' is transferred to the
  // runner which is responsible for generating responses and
  // releasing the requests.
  using StandardRunFunc = std::function<void(
      uint32_t runner_idx,
      std::vector<std::unique_ptr<InferenceRequest>>&& requests)>;

  // Enqueue a request with the scheduler. If Status::Success is returned
  // then the backend has taken ownership of the request object and so
  // 'request' will be nullptr. If non-success is returned then the
  // caller still retains ownership of 'request'.
  virtual Status Enqueue(std::unique_ptr<InferenceRequest>& request) = 0;

  // Return the number of in-flight inferences tracked by the scheduler.
  virtual size_t InflightInferenceCount() = 0;

  // Instruct the scheduler to stop processing future requests unless they
  // are considered as in-flight.
  virtual void Stop() = 0;
};

}}  // namespace triton::core
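The Enqueue() contract above encodes ownership transfer in the unique_ptr itself: on success the callee moves from the reference, leaving the caller's pointer null; on failure the caller keeps the request. A self-contained sketch of that convention with placeholder types (DemoRequest and DemoScheduler are hypothetical):

#include <cstdio>
#include <memory>
#include <vector>

struct DemoRequest {
  int id;
};

class DemoScheduler {
 public:
  // Returns true on success, in which case 'request' is left null.
  bool Enqueue(std::unique_ptr<DemoRequest>& request)
  {
    if (queue_.size() >= 2) {
      return false;  // caller keeps ownership on failure
    }
    queue_.push_back(std::move(request));  // nulls the caller's pointer
    return true;
  }

 private:
  std::vector<std::unique_ptr<DemoRequest>> queue_;
};

int main()
{
  DemoScheduler scheduler;
  auto req = std::make_unique<DemoRequest>(DemoRequest{42});
  if (scheduler.Enqueue(req)) {
    std::printf(
        "enqueued; caller pointer is %s\n",
        req == nullptr ? "null" : "non-null");
  }
  return 0;
}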
3rdparty/core-r22.12/src/scheduler_utils.cc
0 → 100644
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "scheduler_utils.h"
#include <cassert>
#include "constants.h"
#include "triton/common/logging.h"
namespace triton { namespace core {

Status
RequiredEqualInputs::Initialize(
    const std::unique_ptr<InferenceRequest>& request,
    const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
    const bool has_optional_input)
{
  has_optional_input_ = has_optional_input;
  required_inputs_.clear();
  for (const auto& pr : request->ImmutableInputs()) {
    const InferenceRequest::Input* input = pr.second;
    const auto itr = enforce_equal_shape_tensors.find(input->Name());
    if (itr != enforce_equal_shape_tensors.end()) {
      required_inputs_.emplace(
          std::piecewise_construct, std::forward_as_tuple(input->Name()),
          std::forward_as_tuple(input, itr->second));
    }
    // When the model has optional inputs, overload 'required_inputs_'
    // to track the inputs involved in the batch
    else if (has_optional_input) {
      required_inputs_.emplace(
          std::piecewise_construct, std::forward_as_tuple(input->Name()),
          std::forward_as_tuple(nullptr, false));
    }
  }

  init_ = true;
  return Status::Success;
}

bool
RequiredEqualInputs::HasEqualInputs(
    const std::unique_ptr<InferenceRequest>& request)
{
  // If the current request has a different number of inputs, then dynamic
  // batching shouldn't be applied.
  if (has_optional_input_ &&
      (request->ImmutableInputs().size() != required_inputs_.size())) {
    return false;
  }
  for (const auto& pr : request->ImmutableInputs()) {
    const InferenceRequest::Input* input = pr.second;
    const auto itr = required_inputs_.find(input->Name());
    if (itr != required_inputs_.end()) {
      if (itr->second.first != nullptr) {
        // Make sure shape of input tensors is equal.
        if (!triton::common::CompareDims(
                itr->second.first->Shape(), input->Shape())) {
          return false;
        }

        // If necessary compare the contents as well...
        if (itr->second.second) {
          const auto& d1 = itr->second.first->Data();
          const auto& d2 = input->Data();

          // For now being conservative and assuming that content
          // comparison is for shape tensors which are likely to always
          // be in a single buffer.
          if ((d1->BufferCount() != 1) || (d2->BufferCount() != 1)) {
            return false;
          }

          size_t d1_byte_size, d2_byte_size;
          TRITONSERVER_MemoryType d1_memory_type, d2_memory_type;
          int64_t d1_memory_id, d2_memory_id;
          const char* d1_buffer = d1->BufferAt(
              0 /* idx */, &d1_byte_size, &d1_memory_type, &d1_memory_id);
          const char* d2_buffer = d2->BufferAt(
              0 /* idx */, &d2_byte_size, &d2_memory_type, &d2_memory_id);

          // Tensors must be the same size and in CPU memory so that they
          // can be easily compared. If not, return false conservatively.
          if ((d1_byte_size != d2_byte_size) || (d1_buffer == nullptr) ||
              (d2_buffer == nullptr) ||
              (d1_memory_type == TRITONSERVER_MEMORY_GPU) ||
              (d2_memory_type == TRITONSERVER_MEMORY_GPU)) {
            return false;
          }

          if (strncmp(d1_buffer, d2_buffer, d1_byte_size) != 0) {
            return false;
          }
        }
      }
    }
    else if (has_optional_input_) {
      // If the model has optional inputs, the current request must contain
      // all inputs that are in the first request (tracked in
      // 'required_inputs_').
      return false;
    }
  }

  return true;
}
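HasEqualInputs() gates dynamic batching on two checks: equal dims, and, for shape tensors, byte-identical contents of a single CPU buffer. A self-contained sketch of those two comparisons (the CompareDims below is a local stand-in for triton::common::CompareDims, and std::memcmp plays the role of the strncmp over raw bytes above):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Local stand-in for triton::common::CompareDims.
static bool CompareDims(
    const std::vector<int64_t>& a, const std::vector<int64_t>& b)
{
  return a == b;  // element-wise equality, same rank required
}

int main()
{
  std::vector<int64_t> shape1 = {1, 4};
  std::vector<int64_t> shape2 = {1, 4};

  // Contents of two hypothetical shape tensors, each in one CPU buffer.
  const char buf1[] = {2, 0, 3, 0};
  const char buf2[] = {2, 0, 3, 0};

  bool batchable = CompareDims(shape1, shape2) &&
                   (sizeof(buf1) == sizeof(buf2)) &&
                   (std::memcmp(buf1, buf2, sizeof(buf1)) == 0);
  std::printf("can batch together: %s\n", batchable ? "yes" : "no");
  return 0;
}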
Status
PriorityQueue::PolicyQueue::Enqueue(std::unique_ptr<InferenceRequest>& request)
{
  if ((max_queue_size_ != 0) && (Size() >= max_queue_size_)) {
    return Status(
        Status::Code::UNAVAILABLE,
        request->LogRequest() + "Exceeds maximum queue size");
  }

  queue_.emplace_back(std::move(request));
  auto timeout_us = default_timeout_us_;
  if (allow_timeout_override_) {
    auto override_timeout_us = queue_.back()->TimeoutMicroseconds();
    if (override_timeout_us != 0 && override_timeout_us < timeout_us) {
      timeout_us = override_timeout_us;
    }
  }
  if (timeout_us != 0) {
    timeout_timestamp_ns_.emplace_back(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
            .count() +
        timeout_us * 1000);
  }
  else {
    timeout_timestamp_ns_.emplace_back(0);
  }

  return Status::Success;
}
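The enqueue path above converts a microsecond timeout into an absolute steady_clock deadline in nanoseconds, with 0 meaning "no timeout". A self-contained sketch of that deadline arithmetic (DeadlineNs is a hypothetical helper name):

#include <chrono>
#include <cstdint>
#include <cstdio>

// Returns an absolute deadline in ns since the steady_clock epoch,
// or 0 when timeout_us is 0 (meaning "never expires").
static uint64_t DeadlineNs(uint64_t timeout_us)
{
  if (timeout_us == 0) {
    return 0;
  }
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
             .count() +
         timeout_us * 1000;  // us -> ns
}

int main()
{
  uint64_t deadline = DeadlineNs(500);  // 500 us from now
  uint64_t now = std::chrono::duration_cast<std::chrono::nanoseconds>(
                     std::chrono::steady_clock::now().time_since_epoch())
                     .count();
  // Mirrors the expiry test in ApplyPolicy(): non-zero and in the past.
  std::printf(
      "expired: %s\n", (deadline != 0 && now > deadline) ? "yes" : "no");
  return 0;
}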
Status
PriorityQueue::PolicyQueue::Dequeue(std::unique_ptr<InferenceRequest>* request)
{
  if (!queue_.empty()) {
    *request = std::move(queue_.front());
    queue_.pop_front();
    timeout_timestamp_ns_.pop_front();
  }
  else {
    *request = std::move(delayed_queue_.front());
    delayed_queue_.pop_front();
  }
  return Status::Success;
}

bool
PriorityQueue::PolicyQueue::ApplyPolicy(
    size_t idx, size_t* rejected_count, size_t* rejected_batch_size)
{
  uint64_t now_nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count();
  if (idx < queue_.size()) {
    size_t curr_idx = idx;
    while (curr_idx < queue_.size()) {
      if ((timeout_timestamp_ns_[curr_idx] != 0) &&
          (now_nanoseconds > timeout_timestamp_ns_[curr_idx])) {
        if (timeout_action_ == inference::ModelQueuePolicy::DELAY) {
          delayed_queue_.emplace_back(std::move(queue_[curr_idx]));
        }
        else {
          rejected_queue_.emplace_back(std::move(queue_[curr_idx]));
          *rejected_count += 1;
          *rejected_batch_size +=
              std::max(1U, rejected_queue_.back()->BatchSize());
        }
        curr_idx++;
      }
      else {
        break;
      }
    }

    // Use range erasure on the deque as all erasure functions are linear;
    // this implies that, in the edge case where this function is always
    // called on a 'bad' index, the cost can be O(n^2). However, for data
    // structures with O(1) erasure, the traversal may not be as efficient
    // due to cache misses (elements not stored contiguously).
    queue_.erase(queue_.begin() + idx, queue_.begin() + curr_idx);
    timeout_timestamp_ns_.erase(
        timeout_timestamp_ns_.begin() + idx,
        timeout_timestamp_ns_.begin() + curr_idx);

    // Current idx is pointing to an item with unexpired timeout
    if (idx < queue_.size()) {
      return true;
    }
  }
  // At this point, idx is pointing to an item with expired timeout.
  // If the item is in the delayed queue, then return true. Otherwise, false,
  // meaning the queue has no item with this 'idx'.
  return ((idx - queue_.size()) < delayed_queue_.size());
}

void
PriorityQueue::PolicyQueue::ReleaseRejectedQueue(
    std::deque<std::unique_ptr<InferenceRequest>>* requests)
{
  rejected_queue_.swap(*requests);
}

const std::unique_ptr<InferenceRequest>&
PriorityQueue::PolicyQueue::At(size_t idx) const
{
  if (idx < queue_.size()) {
    return queue_[idx];
  }
  else {
    return delayed_queue_[idx - queue_.size()];
  }
}

uint64_t
PriorityQueue::PolicyQueue::TimeoutAt(size_t idx)
{
  if (idx < queue_.size()) {
    return timeout_timestamp_ns_[idx];
  }
  else {
    return 0;
  }
}
PriorityQueue::PriorityQueue()
    : size_(0), front_priority_level_(0), last_priority_level_(0)
{
  inference::ModelQueuePolicy default_policy;
  queues_.emplace(0, PolicyQueue(default_policy));
  front_priority_level_ = queues_.begin()->first;
  ResetCursor();
}

PriorityQueue::PriorityQueue(
    const inference::ModelQueuePolicy& default_queue_policy,
    uint32_t priority_levels, const ModelQueuePolicyMap queue_policy_map)
    : size_(0), last_priority_level_(priority_levels)
{
  if (priority_levels == 0) {
    queues_.emplace(0, PolicyQueue(default_queue_policy));
  }
  else {
    for (uint32_t level = 1; level <= priority_levels; level++) {
      auto it = queue_policy_map.find(level);
      if (it == queue_policy_map.end()) {
        queues_.emplace(level, PolicyQueue(default_queue_policy));
      }
      else {
        queues_.emplace(level, PolicyQueue(it->second));
      }
    }
  }
  front_priority_level_ = queues_.begin()->first;
  ResetCursor();
}

Status
PriorityQueue::Enqueue(
    uint32_t priority_level, std::unique_ptr<InferenceRequest>& request)
{
  auto status = queues_[priority_level].Enqueue(request);
  if (status.IsOk()) {
    size_++;
    front_priority_level_ = std::min(front_priority_level_, priority_level);
    // Invalidate the pending batch cursor if the enqueued item is placed
    // within the pending batch. At the same priority level the request is
    // guaranteed to be after the pending batch if the batch hasn't reached
    // the delayed queue.
    if ((priority_level < pending_cursor_.curr_it_->first) ||
        ((priority_level == pending_cursor_.curr_it_->first) &&
         (pending_cursor_.at_delayed_queue_))) {
      pending_cursor_.valid_ = false;
    }
  }
  return status;
}

Status
PriorityQueue::Dequeue(std::unique_ptr<InferenceRequest>* request)
{
  pending_cursor_.valid_ = false;
  while (true) {
    if (!queues_[front_priority_level_].Empty()) {
      RETURN_IF_ERROR(queues_[front_priority_level_].Dequeue(request));
      size_--;
      return Status::Success;
    }
    else if (front_priority_level_ != last_priority_level_) {
      front_priority_level_++;
      continue;
    }

    // Control reaches here if the queue for the last priority level is also
    // empty; then return the error below.
    break;
  }

  return Status(
      Status::Code::UNAVAILABLE,
      (*request)->LogRequest() + "dequeue on empty queue");
}

void
PriorityQueue::ReleaseRejectedRequests(
    std::shared_ptr<
        std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>* requests)
{
  auto res = std::make_shared<
      std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>(
      queues_.size());
  size_t idx = 0;
  for (auto& queue : queues_) {
    queue.second.ReleaseRejectedQueue(&((*res)[idx]));
    idx++;
  }

  requests->swap(res);
}

bool
PriorityQueue::IsCursorValid()
{
  if (pending_cursor_.valid_) {
    return (uint64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now().time_since_epoch())
               .count() <
           pending_cursor_.pending_batch_closest_timeout_ns_;
  }
  return false;
}

PriorityQueue::Cursor::Cursor(PriorityQueues::iterator start_it)
    : curr_it_(start_it), queue_idx_(0), at_delayed_queue_(false),
      pending_batch_closest_timeout_ns_(0),
      pending_batch_oldest_enqueue_time_ns_(0), pending_batch_count_(0),
      valid_(true)
{
}

size_t
PriorityQueue::ApplyPolicyAtCursor()
{
  size_t rejected_batch_size = 0;
  size_t rejected_count = 0;
  while (pending_cursor_.curr_it_ != queues_.end()) {
    if (!(pending_cursor_.curr_it_->second.ApplyPolicy(
            pending_cursor_.queue_idx_, &rejected_count,
            &rejected_batch_size))) {
      if (size_ > pending_cursor_.pending_batch_count_ + rejected_count) {
        pending_cursor_.curr_it_++;
        pending_cursor_.queue_idx_ = 0;
        continue;
      }
    }
    // Control reaches here if the cursor points to a request that is a
    // candidate for the pending batch, or if all requests are in the pending
    // batch.
    break;
  }
  size_ -= rejected_count;
  return rejected_batch_size;
}

void
PriorityQueue::AdvanceCursor()
{
  if (pending_cursor_.pending_batch_count_ >= size_) {
    return;
  }

  const auto& timeout_ns =
      pending_cursor_.curr_it_->second.TimeoutAt(pending_cursor_.queue_idx_);
  if (timeout_ns != 0) {
    if (pending_cursor_.pending_batch_closest_timeout_ns_ != 0) {
      pending_cursor_.pending_batch_closest_timeout_ns_ = std::min(
          pending_cursor_.pending_batch_closest_timeout_ns_, timeout_ns);
    }
    else {
      pending_cursor_.pending_batch_closest_timeout_ns_ = timeout_ns;
    }
  }

  uint64_t curr_enqueue_time_ns =
      pending_cursor_.curr_it_->second.At(pending_cursor_.queue_idx_)
          ->BatcherStartNs();
  if (pending_cursor_.pending_batch_oldest_enqueue_time_ns_ != 0) {
    pending_cursor_.pending_batch_oldest_enqueue_time_ns_ = std::min(
        pending_cursor_.pending_batch_oldest_enqueue_time_ns_,
        curr_enqueue_time_ns);
  }
  else {
    pending_cursor_.pending_batch_oldest_enqueue_time_ns_ =
        curr_enqueue_time_ns;
  }
  ++pending_cursor_.queue_idx_;
  ++pending_cursor_.pending_batch_count_;
  // pending batch includes delayed request if (queue_idx_ - 1) points to
  // delayed queue.
  pending_cursor_.at_delayed_queue_ =
      (pending_cursor_.queue_idx_ >
       pending_cursor_.curr_it_->second.UnexpiredSize());
}

}}  // namespace triton::core
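The PriorityQueue above is an std::map of priority level -> PolicyQueue, dequeued by walking levels from the smallest key and caching front_priority_level_ to skip the scan on the hot path. A self-contained sketch of that level-walk (DemoDequeue and the queue contents are hypothetical):

#include <cstdint>
#include <cstdio>
#include <deque>
#include <map>

std::map<uint32_t, std::deque<int>> queues = {
    {1, {}}, {2, {7, 8}}, {3, {9}}};
uint32_t front_level = 1;  // cached lowest level that may be non-empty

// Pops the next item in priority order (lower level = higher priority).
static bool DemoDequeue(int* out)
{
  const uint32_t last_level = queues.rbegin()->first;
  while (true) {
    if (!queues[front_level].empty()) {
      *out = queues[front_level].front();
      queues[front_level].pop_front();
      return true;
    }
    if (front_level == last_level) {
      return false;  // every level drained
    }
    front_level++;  // advance past the empty level, like Dequeue() above
  }
}

int main()
{
  int item;
  while (DemoDequeue(&item)) {
    std::printf("dequeued %d\n", item);  // prints 7, 8, 9 in that order
  }
  return 0;
}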
3rdparty/core-r22.12/src/scheduler_utils.h
0 → 100644
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <deque>
#include <unordered_map>
#include "scheduler.h"
namespace triton { namespace core {

struct RequiredEqualInputs {
 public:
  RequiredEqualInputs() : init_(false), has_optional_input_(false) {}

  Status Initialize(
      const std::unique_ptr<InferenceRequest>& request,
      const std::unordered_map<std::string, bool>& enforce_equal_shape_tensors,
      const bool has_optional_input);

  bool HasEqualInputs(const std::unique_ptr<InferenceRequest>& request);

  bool Initialized() { return init_; };

 private:
  bool init_;
  bool has_optional_input_;
  // A collection of inputs in the request; a nullptr for
  // InferenceRequest::Input indicates that the input doesn't require an
  // equality check
  std::unordered_map<
      std::string,
      std::pair<const InferenceRequest::Input*, bool /* compare contents */>>
      required_inputs_;
};

//
// PriorityQueue
//
using ModelQueuePolicyMap = ::google::protobuf::Map<
    ::google::protobuf::uint32, inference::ModelQueuePolicy>;

class PriorityQueue {
 public:
  // Construct a queue with no priority level with default queue policy,
  // which will behave the same as a regular queue.
  PriorityQueue();

  // Construct a queue with 'priority_levels'; the priority starts from 1.
  // Different priority levels may follow different queue policies given by
  // 'queue_policy_map'; otherwise, the 'default_queue_policy' will be used.
  PriorityQueue(
      const inference::ModelQueuePolicy& default_queue_policy,
      uint32_t priority_levels, const ModelQueuePolicyMap queue_policy_map);

  // Enqueue a request with priority set to 'priority_level'. If
  // Status::Success is returned then the queue has taken ownership of
  // the request object and so 'request' will be nullptr. If
  // non-success is returned then the caller still retains ownership
  // of 'request'.
  Status Enqueue(
      uint32_t priority_level, std::unique_ptr<InferenceRequest>& request);

  // Dequeue the request at the front of the queue.
  Status Dequeue(std::unique_ptr<InferenceRequest>* request);

  // Retrieve the requests that are rejected based on the queue policies.
  void ReleaseRejectedRequests(
      std::shared_ptr<
          std::vector<std::deque<std::unique_ptr<InferenceRequest>>>>*
          requests);

  // Return the number of requests in the queue; rejected requests are
  // not included.
  size_t Size() { return size_; }

  // Is the queue empty? Rejected requests are not included.
  bool Empty() { return Size() == 0; }

  // Reset the cursor such that it is representing an empty pending batch.
  void ResetCursor() { pending_cursor_ = Cursor(queues_.begin()); }

  // Record the current cursor. The cursor can be restored to the recorded
  // state by invoking SetCursorToMark(). Note that Enqueue(), Dequeue(), and
  // ResetCursor() will invalidate the marker; it is the function caller's
  // responsibility to ensure the marker is valid before calling
  // SetCursorToMark().
  void MarkCursor() { current_mark_ = pending_cursor_; }

  // Apply the queue policy and alter the underlying queue accordingly. After
  // the function returns, the cursor may be at its end to indicate that
  // there is no request after the pending batch.
  // Returns the total batch size of the newly rejected requests.
  size_t ApplyPolicyAtCursor();

  // Return the request at the cursor.
  const std::unique_ptr<InferenceRequest>& RequestAtCursor()
  {
    return pending_cursor_.curr_it_->second.At(pending_cursor_.queue_idx_);
  }

  // Advance the cursor for the pending batch. This function will not trigger
  // the queue policy. No effect if the cursor has already reached the end of
  // the queue.
  void AdvanceCursor();

  // Whether the cursor reaches its end.
  bool CursorEnd() { return pending_cursor_.pending_batch_count_ == size_; }

  // Restore the cursor state to the marker.
  void SetCursorToMark() { pending_cursor_ = current_mark_; }

  // Whether the cursor is still valid. The cursor is valid only if the
  // pending batch is unchanged.
  bool IsCursorValid();

  // Return the oldest queued time of requests in the pending batch.
  uint64_t OldestEnqueueTime()
  {
    return pending_cursor_.pending_batch_oldest_enqueue_time_ns_;
  }

  // Return the closest timeout of requests in the pending batch.
  uint64_t ClosestTimeout()
  {
    return pending_cursor_.pending_batch_closest_timeout_ns_;
  }

  // Return the number of requests in the pending batch.
  size_t PendingBatchCount() { return pending_cursor_.pending_batch_count_; }

 private:
  class PolicyQueue {
   public:
    // Construct a policy queue with default policy, which will behave the
    // same as a regular queue.
    PolicyQueue()
        : timeout_action_(inference::ModelQueuePolicy::REJECT),
          default_timeout_us_(0), allow_timeout_override_(false),
          max_queue_size_(0)
    {
    }

    // Construct a policy queue with given 'policy'.
    PolicyQueue(const inference::ModelQueuePolicy& policy)
        : timeout_action_(policy.timeout_action()),
          default_timeout_us_(policy.default_timeout_microseconds()),
          allow_timeout_override_(policy.allow_timeout_override()),
          max_queue_size_(policy.max_queue_size())
    {
    }

    // Enqueue a request and set up its timeout accordingly. If
    // Status::Success is returned then the queue has taken ownership
    // of the request object and so 'request' will be nullptr. If
    // non-success is returned then the caller still retains ownership
    // of 'request'.
    Status Enqueue(std::unique_ptr<InferenceRequest>& request);

    // Dequeue the request at the front of the queue.
    Status Dequeue(std::unique_ptr<InferenceRequest>* request);

    // Apply the queue policy to the request at 'idx'.
    // 'rejected_count' will be incremented by the number of the newly
    // rejected requests after applying the policy.
    // 'rejected_batch_size' will be incremented by the total batch size of
    // the newly rejected requests after applying the policy.
    // Return true if the 'idx' still points to a request after applying the
    // policy, false otherwise.
    bool ApplyPolicy(
        size_t idx, size_t* rejected_count, size_t* rejected_batch_size);

    // Return the rejected requests held by the queue.
    void ReleaseRejectedQueue(
        std::deque<std::unique_ptr<InferenceRequest>>* requests);

    // Return the request at 'idx'.
    const std::unique_ptr<InferenceRequest>& At(size_t idx) const;

    // Return the timeout timestamp of the request at 'idx', in ns. A value
    // of 0 indicates that the request doesn't specify a timeout.
    uint64_t TimeoutAt(size_t idx);

    // Return whether the queue is empty; rejected requests are not included.
    bool Empty() { return Size() == 0; }

    // Return the number of requests in the queue; rejected requests are not
    // included.
    size_t Size() { return queue_.size() + delayed_queue_.size(); }

    // Return the number of unexpired requests in the queue
    size_t UnexpiredSize() { return queue_.size(); }

   private:
    // Variables that define the policy for the queue
    const inference::ModelQueuePolicy::TimeoutAction timeout_action_;
    const uint64_t default_timeout_us_;
    const bool allow_timeout_override_;
    const uint32_t max_queue_size_;

    std::deque<uint64_t> timeout_timestamp_ns_;
    std::deque<std::unique_ptr<InferenceRequest>> queue_;
    std::deque<std::unique_ptr<InferenceRequest>> delayed_queue_;
    std::deque<std::unique_ptr<InferenceRequest>> rejected_queue_;
  };
  using PriorityQueues = std::map<uint32_t, PolicyQueue>;

  // Cursor for tracking the pending batch; the cursor points to the item
  // after the pending batch.
  struct Cursor {
    Cursor() = default;
    Cursor(PriorityQueues::iterator start_it);
    Cursor(const Cursor& rhs) = default;
    Cursor& operator=(const Cursor& rhs) = default;

    PriorityQueues::iterator curr_it_;
    size_t queue_idx_;
    bool at_delayed_queue_;
    uint64_t pending_batch_closest_timeout_ns_;
    uint64_t pending_batch_oldest_enqueue_time_ns_;
    size_t pending_batch_count_;
    bool valid_;
  };

  PriorityQueues queues_;
  size_t size_;

  // Keep track of the priority level that the first request in the queue
  // is at to avoid traversing 'queues_'
  uint32_t front_priority_level_;
  uint32_t last_priority_level_;

  Cursor pending_cursor_;
  Cursor current_mark_;
};

}}  // namespace triton::core
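The cursor API above (MarkCursor/SetCursorToMark) lets a batcher speculatively extend a pending batch and roll back if the extension is unacceptable. A self-contained sketch of that mark/restore idea over a plain index cursor (DemoCursor and the acceptance test are hypothetical):

#include <cstddef>
#include <cstdio>
#include <vector>

struct DemoCursor {
  std::size_t idx = 0;  // points just past the pending batch
};

int main()
{
  std::vector<int> queue = {10, 20, 30, 40};
  DemoCursor cursor;

  cursor.idx = 2;            // pending batch currently covers {10, 20}
  DemoCursor mark = cursor;  // MarkCursor(): cheap copy of cursor state

  cursor.idx = 4;            // speculatively advance over {30, 40}
  bool accept_larger_batch = false;  // e.g. it would exceed max batch size
  if (!accept_larger_batch) {
    cursor = mark;           // SetCursorToMark(): roll back the speculation
  }
  std::printf("pending batch count = %zu\n", cursor.idx);
  return 0;
}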