Commit 0a21fff9 authored by xiabo

Adapt to 0.1.0

parent 9484fd1c
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <chrono>
#include <memory>
#include "constants.h"
#include "status.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
#ifdef TRITON_ENABLE_TRACING
//
// InferenceTrace
//
// Interface to TRITONSERVER_InferenceTrace to report trace events.
//
class InferenceTrace {
public:
InferenceTrace(
const TRITONSERVER_InferenceTraceLevel level, const uint64_t parent_id,
TRITONSERVER_InferenceTraceActivityFn_t activity_fn,
TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn,
TRITONSERVER_InferenceTraceReleaseFn_t release_fn, void* userp)
: level_(level), id_(next_id_++), parent_id_(parent_id),
activity_fn_(activity_fn), tensor_activity_fn_(tensor_activity_fn),
release_fn_(release_fn), userp_(userp)
{
}
InferenceTrace* SpawnChildTrace();
int64_t Id() const { return id_; }
int64_t ParentId() const { return parent_id_; }
const std::string& ModelName() const { return model_name_; }
int64_t ModelVersion() const { return model_version_; }
void SetModelName(const std::string& n) { model_name_ = n; }
void SetModelVersion(int64_t v) { model_version_ = v; }
// Report trace activity.
void Report(
const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns)
{
if ((level_ & TRITONSERVER_TRACE_LEVEL_TIMESTAMPS) > 0) {
activity_fn_(
reinterpret_cast<TRITONSERVER_InferenceTrace*>(this), activity,
timestamp_ns, userp_);
}
}
// Report trace activity at the current time.
void ReportNow(const TRITONSERVER_InferenceTraceActivity activity)
{
if ((level_ & TRITONSERVER_TRACE_LEVEL_TIMESTAMPS) > 0) {
Report(
activity, std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count());
}
}
// Report tensor trace activity.
void ReportTensor(
const TRITONSERVER_InferenceTraceActivity activity, const char* name,
TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
const int64_t* shape, uint64_t dim_count,
TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
{
if ((level_ & TRITONSERVER_TRACE_LEVEL_TENSORS) > 0) {
tensor_activity_fn_(
reinterpret_cast<TRITONSERVER_InferenceTrace*>(this), activity, name,
datatype, base, byte_size, shape, dim_count, memory_type,
memory_type_id, userp_);
}
}
// Release the trace. Call the trace release callback.
void Release();
private:
const TRITONSERVER_InferenceTraceLevel level_;
const uint64_t id_;
const uint64_t parent_id_;
TRITONSERVER_InferenceTraceActivityFn_t activity_fn_;
TRITONSERVER_InferenceTraceTensorActivityFn_t tensor_activity_fn_;
TRITONSERVER_InferenceTraceReleaseFn_t release_fn_;
void* userp_;
std::string model_name_;
int64_t model_version_;
// Maintain next id statically so that trace id is unique even
// across traces
static std::atomic<uint64_t> next_id_;
};
//
// InferenceTraceProxy
//
// Object attached as shared_ptr to InferenceRequest and
// InferenceResponse(s) being traced as part of a single inference
// request.
//
class InferenceTraceProxy {
public:
InferenceTraceProxy(InferenceTrace* trace) : trace_(trace) {}
~InferenceTraceProxy() { trace_->Release(); }
int64_t Id() const { return trace_->Id(); }
int64_t ParentId() const { return trace_->ParentId(); }
const std::string& ModelName() const { return trace_->ModelName(); }
int64_t ModelVersion() const { return trace_->ModelVersion(); }
void SetModelName(const std::string& n) { trace_->SetModelName(n); }
void SetModelVersion(int64_t v) { trace_->SetModelVersion(v); }
void Report(
const TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns)
{
trace_->Report(activity, timestamp_ns);
}
void ReportNow(const TRITONSERVER_InferenceTraceActivity activity)
{
trace_->ReportNow(activity);
}
void ReportTensor(
const TRITONSERVER_InferenceTraceActivity activity, const char* name,
TRITONSERVER_DataType datatype, const void* base, size_t byte_size,
const int64_t* shape, uint64_t dim_count,
TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
{
trace_->ReportTensor(
activity, name, datatype, base, byte_size, shape, dim_count,
memory_type, memory_type_id);
}
std::shared_ptr<InferenceTraceProxy> SpawnChildTrace();
private:
InferenceTrace* trace_;
};
#endif // TRITON_ENABLE_TRACING
//
// Macros to generate trace activity
//
#ifdef TRITON_ENABLE_TRACING
#define INFER_TRACE_ACTIVITY(T, A, TS_NS) \
{ \
const auto& trace = (T); \
const auto ts_ns = (TS_NS); \
if (trace != nullptr) { \
trace->Report(A, ts_ns); \
} \
}
#define INFER_TRACE_ACTIVITY_NOW(T, A) \
{ \
const auto& trace = (T); \
if (trace != nullptr) { \
trace->ReportNow(A); \
} \
}
#define INFER_TRACE_TENSOR_ACTIVITY(T, A, N, D, BA, BY, S, DI, MT, MTI) \
{ \
const auto& trace = (T); \
if (trace != nullptr) { \
trace->ReportTensor(A, N, D, BA, BY, S, DI, MT, MTI); \
} \
}
#else
#define INFER_TRACE_ACTIVITY(T, A, TS_NS)
#define INFER_TRACE_ACTIVITY_NOW(T, A)
#define INFER_TRACE_TENSOR_ACTIVITY(T, A, N, D, BA, BY, S, DI, MT, MTI)
#endif // TRITON_ENABLE_TRACING
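//
// A minimal usage sketch for the macros above (illustration only, not part of
// this commit; 'trace' and 'request_start_ns' are hypothetical locals). The
// no-op #else branch means call sites need no #ifdef guards of their own:
//
//   std::shared_ptr<InferenceTraceProxy> trace = ...;  // may be nullptr
//   INFER_TRACE_ACTIVITY(trace, TRITONSERVER_TRACE_REQUEST_START, request_start_ns);
//   INFER_TRACE_ACTIVITY_NOW(trace, TRITONSERVER_TRACE_QUEUE_START);
//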
}} // namespace triton::core
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "instance_queue.h"
#include "triton/common/logging.h"
namespace triton { namespace core {
InstanceQueue::InstanceQueue(size_t max_batch_size, uint64_t max_queue_delay_ns)
: max_batch_size_(max_batch_size), max_queue_delay_ns_(max_queue_delay_ns)
{
}
size_t
InstanceQueue::Size()
{
return payload_queue_.size();
}
bool
InstanceQueue::Empty()
{
return payload_queue_.empty();
}
void
InstanceQueue::Enqueue(const std::shared_ptr<Payload>& payload)
{
payload_queue_.push_back(payload);
}
void
InstanceQueue::Dequeue(
std::shared_ptr<Payload>* payload,
std::vector<std::shared_ptr<Payload>>* merged_payloads)
{
*payload = payload_queue_.front();
payload_queue_.pop_front();
{
std::lock_guard<std::mutex> exec_lock(*((*payload)->GetExecMutex()));
(*payload)->SetState(Payload::State::EXECUTING);
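// Opportunistically merge queued payloads into the dequeued one: merging is
// attempted only when a max queue delay is configured, batching is enabled
// (max_batch_size_ > 1), and the dequeued payload is not already saturated.
// Each candidate at the front of the queue must itself have exceeded the max
// queue delay, and the combined batch must still fit within max_batch_size_.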
if ((!payload_queue_.empty()) && (max_queue_delay_ns_ > 0) &&
(max_batch_size_ > 1) && (!(*payload)->IsSaturated())) {
bool continue_merge;
do {
continue_merge = false;
uint64_t now_ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch())
.count();
size_t batch_size = (*payload)->BatchSize();
if ((!payload_queue_.empty()) &&
(!payload_queue_.front()->IsSaturated()) &&
(now_ns - payload_queue_.front()->BatcherStartNs()) >
max_queue_delay_ns_) {
std::lock_guard<std::mutex> exec_lock(
*(payload_queue_.front()->GetExecMutex()));
payload_queue_.front()->SetState(Payload::State::EXECUTING);
size_t front_batch_size = payload_queue_.front()->BatchSize();
if ((batch_size + front_batch_size) <= max_batch_size_) {
const auto& status =
(*payload)->MergePayload(payload_queue_.front());
if (status.IsOk()) {
merged_payloads->push_back(payload_queue_.front());
payload_queue_.pop_front();
continue_merge = true;
}
}
}
} while (continue_merge);
}
}
}
}} // namespace triton::core
// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "payload.h"
namespace triton { namespace core {
//
// InstanceQueue
//
// A queue implementation holding Payloads that are ready to be scheduled on
// a model instance. A usage sketch follows the class definition.
class InstanceQueue {
public:
explicit InstanceQueue(size_t max_batch_size, uint64_t max_queue_delay_ns);
size_t Size();
bool Empty();
void Enqueue(const std::shared_ptr<Payload>& payload);
void Dequeue(
std::shared_ptr<Payload>* payload,
std::vector<std::shared_ptr<Payload>>* merged_payloads);
private:
size_t max_batch_size_;
uint64_t max_queue_delay_ns_;
std::deque<std::shared_ptr<Payload>> payload_queue_;
std::shared_ptr<Payload> staged_payload_;
std::mutex mu_;
};
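//
// A minimal usage sketch (illustration only, not part of this commit; the
// batch size and delay values are arbitrary assumptions):
//
//   InstanceQueue queue(8 /* max_batch_size */, 100000 /* max_queue_delay_ns */);
//   queue.Enqueue(payload);
//   std::shared_ptr<Payload> next;
//   std::vector<std::shared_ptr<Payload>> merged;
//   if (!queue.Empty()) {
//     queue.Dequeue(&next, &merged);  // 'merged' holds payloads folded into 'next'
//   }
//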
}} // namespace triton::core
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "label_provider.h"
#include <iostream>
#include <iterator>
#include <sstream>
#include "filesystem.h"
namespace triton { namespace core {
const std::string&
LabelProvider::GetLabel(const std::string& name, size_t index) const
{
static const std::string not_found;
auto itr = label_map_.find(name);
if (itr == label_map_.end()) {
return not_found;
}
if (itr->second.size() <= index) {
return not_found;
}
return itr->second[index];
}
Status
LabelProvider::AddLabels(const std::string& name, const std::string& filepath)
{
std::string label_file_contents;
RETURN_IF_ERROR(ReadTextFile(filepath, &label_file_contents));
auto p = label_map_.insert(std::make_pair(name, std::vector<std::string>()));
if (!p.second) {
return Status(
Status::Code::INTERNAL, "multiple label files for '" + name + "'");
}
auto itr = p.first;
std::istringstream label_file_stream(label_file_contents);
std::string line;
while (std::getline(label_file_stream, line)) {
itr->second.push_back(line);
}
return Status::Success;
}
const std::vector<std::string>&
LabelProvider::GetLabels(const std::string& name)
{
static const std::vector<std::string> not_found;
auto itr = label_map_.find(name);
if (itr == label_map_.end()) {
return not_found;
}
return itr->second;
}
Status
LabelProvider::AddLabels(
const std::string& name, const std::vector<std::string>& labels)
{
label_map_.emplace(name, labels);
return Status::Success;
}
}} // namespace triton::core
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include <unordered_map>
#include <vector>
#include "constants.h"
#include "status.h"
namespace triton { namespace core {
// Provides classification labels.
class LabelProvider {
public:
LabelProvider() = default;
// Return the label associated with 'name' for a given
// 'index'. Return empty string if no label is available.
const std::string& GetLabel(const std::string& name, size_t index) const;
// Associate with 'name' a set of labels initialized from a given
// 'filepath'. Within the file each label is specified on its own
// line. The first label (line 0) is the index-0 label, the second
// label (line 1) is the index-1 label, etc.
Status AddLabels(const std::string& name, const std::string& filepath);
// Return the labels associated with 'name'. Return empty vector if no labels
// are available.
const std::vector<std::string>& GetLabels(const std::string& name);
// Associate with 'name' a set of 'labels'
Status AddLabels(
const std::string& name, const std::vector<std::string>& labels);
private:
DISALLOW_COPY_AND_ASSIGN(LabelProvider);
std::unordered_map<std::string, std::vector<std::string>> label_map_;
};
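//
// A minimal usage sketch (illustration only, not part of this commit; the
// output name and labels are arbitrary assumptions):
//
//   LabelProvider provider;
//   provider.AddLabels("OUTPUT0", {"cat", "dog", "bird"});
//   const std::string& label = provider.GetLabel("OUTPUT0", 1);     // "dog"
//   const std::string& missing = provider.GetLabel("OUTPUT0", 10);  // ""
//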
}} // namespace triton::core
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{
global:
TRITONSERVER_*;
TRITONBACKEND_*;
TRITONREPOAGENT_*;
local: *;
};
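#
# The map above exports only the TRITONSERVER_*, TRITONBACKEND_*, and
# TRITONREPOAGENT_* C API symbols from the shared library and keeps every
# other symbol local. A sketch of how such a version script is typically
# applied at link time (illustrative only; library and script names here are
# assumptions):
#
#   g++ -shared -o libtritonserver.so *.o \
#       -Wl,--version-script=libtritonserver.ldscript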
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "memory.h"
#include "pinned_memory_manager.h"
#include "triton/common/logging.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#include "cuda_memory_manager.h"
#endif // TRITON_ENABLE_GPU
namespace triton { namespace core {
//
// MemoryReference
//
MemoryReference::MemoryReference() : Memory() {}
const char*
MemoryReference::BufferAt(
size_t idx, size_t* byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id) const
{
if (idx >= buffer_.size()) {
*byte_size = 0;
*memory_type = TRITONSERVER_MEMORY_CPU;
*memory_type_id = 0;
return nullptr;
}
*memory_type = buffer_[idx].buffer_attributes_.MemoryType();
*memory_type_id = buffer_[idx].buffer_attributes_.MemoryTypeId();
*byte_size = buffer_[idx].buffer_attributes_.ByteSize();
return buffer_[idx].buffer_;
}
const char*
MemoryReference::BufferAt(size_t idx, BufferAttributes** buffer_attributes)
{
if (idx >= buffer_.size()) {
*buffer_attributes = nullptr;
return nullptr;
}
*buffer_attributes = &(buffer_[idx].buffer_attributes_);
return buffer_[idx].buffer_;
}
size_t
MemoryReference::AddBuffer(
const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
total_byte_size_ += byte_size;
buffer_count_++;
buffer_.emplace_back(buffer, byte_size, memory_type, memory_type_id);
return buffer_.size() - 1;
}
size_t
MemoryReference::AddBuffer(
const char* buffer, BufferAttributes* buffer_attributes)
{
total_byte_size_ += buffer_attributes->ByteSize();
buffer_count_++;
buffer_.emplace_back(buffer, buffer_attributes);
return buffer_.size() - 1;
}
size_t
MemoryReference::AddBufferFront(
const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
total_byte_size_ += byte_size;
buffer_count_++;
buffer_.emplace(
buffer_.begin(), buffer, byte_size, memory_type, memory_type_id);
return buffer_.size() - 1;
}
//
// MutableMemory
//
MutableMemory::MutableMemory(
char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
: Memory(), buffer_(buffer),
buffer_attributes_(
BufferAttributes(byte_size, memory_type, memory_type_id, nullptr))
{
total_byte_size_ = byte_size;
buffer_count_ = (byte_size == 0) ? 0 : 1;
}
const char*
MutableMemory::BufferAt(
size_t idx, size_t* byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id) const
{
if (idx != 0) {
*byte_size = 0;
*memory_type = TRITONSERVER_MEMORY_CPU;
*memory_type_id = 0;
return nullptr;
}
*byte_size = total_byte_size_;
*memory_type = buffer_attributes_.MemoryType();
*memory_type_id = buffer_attributes_.MemoryTypeId();
return buffer_;
}
const char*
MutableMemory::BufferAt(size_t idx, BufferAttributes** buffer_attributes)
{
if (idx != 0) {
*buffer_attributes = nullptr;
return nullptr;
}
*buffer_attributes = &buffer_attributes_;
return buffer_;
}
char*
MutableMemory::MutableBuffer(
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
if (memory_type != nullptr) {
*memory_type = buffer_attributes_.MemoryType();
}
if (memory_type_id != nullptr) {
*memory_type_id = buffer_attributes_.MemoryTypeId();
}
return buffer_;
}
//
// AllocatedMemory
//
AllocatedMemory::AllocatedMemory(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
: MutableMemory(nullptr, byte_size, memory_type, memory_type_id)
{
if (total_byte_size_ != 0) {
// Allocate memory with the following fallback policy:
// CUDA memory -> pinned system memory -> non-pinned system memory
switch (buffer_attributes_.MemoryType()) {
#ifdef TRITON_ENABLE_GPU
case TRITONSERVER_MEMORY_GPU: {
auto status = CudaMemoryManager::Alloc(
(void**)&buffer_, total_byte_size_,
buffer_attributes_.MemoryTypeId());
if (!status.IsOk()) {
static bool warning_logged = false;
if (!warning_logged) {
LOG_WARNING << status.Message()
<< ", falling back to pinned system memory";
warning_logged = true;
}
goto pinned_memory_allocation;
}
break;
}
pinned_memory_allocation:
#endif // TRITON_ENABLE_GPU
default: {
TRITONSERVER_MemoryType memory_type = buffer_attributes_.MemoryType();
auto status = PinnedMemoryManager::Alloc(
(void**)&buffer_, total_byte_size_, &memory_type, true);
buffer_attributes_.SetMemoryType(memory_type);
if (!status.IsOk()) {
LOG_ERROR << status.Message();
buffer_ = nullptr;
}
break;
}
}
}
total_byte_size_ = (buffer_ == nullptr) ? 0 : total_byte_size_;
}
AllocatedMemory::~AllocatedMemory()
{
if (buffer_ != nullptr) {
switch (buffer_attributes_.MemoryType()) {
case TRITONSERVER_MEMORY_GPU: {
#ifdef TRITON_ENABLE_GPU
auto status =
CudaMemoryManager::Free(buffer_, buffer_attributes_.MemoryTypeId());
if (!status.IsOk()) {
LOG_ERROR << status.Message();
}
#endif // TRITON_ENABLE_GPU
break;
}
default: {
auto status = PinnedMemoryManager::Free(buffer_);
if (!status.IsOk()) {
LOG_ERROR << status.Message();
buffer_ = nullptr;
}
break;
}
}
buffer_ = nullptr;
}
}
}} // namespace triton::core
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <vector>
#include "buffer_attributes.h"
#include "constants.h"
#include "status.h"
namespace triton { namespace core {
//
// Memory used to access data in inference requests
//
class Memory {
public:
// Get the 'idx'-th data block in the buffer. An index is used instead
// of internal iteration state so that one buffer can be shared across
// multiple providers.
// 'idx' is a zero-based index; valid indices are contiguous.
// 'byte_size' returns the byte size of the chunk of bytes.
// 'memory_type' returns the memory type of the chunk of bytes.
// 'memory_type_id' returns the memory type id of the chunk of bytes.
// Return the pointer to the data block, or nullptr if 'idx' is out of
// range.
virtual const char* BufferAt(
size_t idx, size_t* byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id) const = 0;
// Similar to the above BufferAt but with BufferAttributes.
virtual const char* BufferAt(
size_t idx, BufferAttributes** buffer_attributes) = 0;
// Get the number of contiguous buffers composing the memory.
size_t BufferCount() const { return buffer_count_; }
// Return the total byte size of the data buffer
size_t TotalByteSize() const { return total_byte_size_; }
protected:
Memory() : total_byte_size_(0), buffer_count_(0) {}
size_t total_byte_size_;
size_t buffer_count_;
};
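//
// A minimal sketch of walking a Memory object buffer by buffer (illustration
// only, not part of this commit):
//
//   size_t byte_size;
//   TRITONSERVER_MemoryType memory_type;
//   int64_t memory_type_id;
//   for (size_t idx = 0; idx < memory.BufferCount(); ++idx) {
//     const char* base =
//         memory.BufferAt(idx, &byte_size, &memory_type, &memory_type_id);
//     // ... consume 'byte_size' bytes at 'base' ...
//   }
//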
//
// MemoryReference
//
class MemoryReference : public Memory {
public:
// Create a read-only data buffer as a reference to another data buffer
MemoryReference();
//\see Memory::BufferAt()
const char* BufferAt(
size_t idx, size_t* byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id) const override;
const char* BufferAt(
size_t idx, BufferAttributes** buffer_attributes) override;
// Add a 'buffer' with 'byte_size' as part of this data buffer.
// Return the index of the added buffer.
size_t AddBuffer(
const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
size_t AddBuffer(const char* buffer, BufferAttributes* buffer_attributes);
// Add a 'buffer' with 'byte_size' at the front of this data buffer.
// Return the index of the buffer.
size_t AddBufferFront(
const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
private:
struct Block {
Block(
const char* buffer, size_t byte_size,
TRITONSERVER_MemoryType memory_type, int64_t memory_type_id)
: buffer_(buffer), buffer_attributes_(BufferAttributes(
byte_size, memory_type, memory_type_id, nullptr))
{
}
Block(const char* buffer, BufferAttributes* buffer_attributes)
: buffer_(buffer), buffer_attributes_(*buffer_attributes)
{
}
const char* buffer_;
BufferAttributes buffer_attributes_;
};
std::vector<Block> buffer_;
};
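//
// A minimal sketch (illustration only, not part of this commit; 'buf0'/'buf1'
// are hypothetical existing buffers): MemoryReference references data without
// copying it.
//
//   MemoryReference ref;
//   ref.AddBuffer(buf0, buf0_size, TRITONSERVER_MEMORY_CPU, 0);
//   ref.AddBuffer(buf1, buf1_size, TRITONSERVER_MEMORY_CPU, 0);
//   // ref.BufferCount() == 2, ref.TotalByteSize() == buf0_size + buf1_size
//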
//
// MutableMemory
//
class MutableMemory : public Memory {
public:
// Create a mutable data buffer referencing another data buffer.
MutableMemory(
char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
virtual ~MutableMemory() {}
//\see Memory::BufferAt()
const char* BufferAt(
size_t idx, size_t* byte_size, TRITONSERVER_MemoryType* memory_type,
int64_t* memory_type_id) const override;
//\see Memory::BufferAt()
const char* BufferAt(
size_t idx, BufferAttributes** buffer_attributes) override;
// Return a pointer to the base address of the mutable buffer. If
// non-null 'memory_type' returns the memory type of the chunk of
// bytes. If non-null 'memory_type_id' returns the memory type id of
// the chunk of bytes.
char* MutableBuffer(
TRITONSERVER_MemoryType* memory_type = nullptr,
int64_t* memory_type_id = nullptr);
DISALLOW_COPY_AND_ASSIGN(MutableMemory);
protected:
MutableMemory() : Memory() {}
char* buffer_;
BufferAttributes buffer_attributes_;
};
//
// AllocatedMemory
//
class AllocatedMemory : public MutableMemory {
public:
// Create a contiguous data buffer with 'byte_size', 'memory_type' and
// 'memory_type_id'. Note that the buffer may be created with a different
// memory type and memory type id if the originally requested type and id
// cannot be satisfied, so the caller should always check the actual memory
// type and memory type id before use (a usage sketch follows the class).
AllocatedMemory(
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id);
~AllocatedMemory() override;
};
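//
// A minimal sketch (illustration only, not part of this commit) showing why
// the actual placement must be re-checked after construction, per the
// allocation fallback described above:
//
//   AllocatedMemory out(byte_size, TRITONSERVER_MEMORY_GPU, 0 /* device */);
//   TRITONSERVER_MemoryType actual_type;
//   int64_t actual_id;
//   char* base = out.MutableBuffer(&actual_type, &actual_id);
//   // 'actual_type' may be TRITONSERVER_MEMORY_CPU_PINNED or
//   // TRITONSERVER_MEMORY_CPU if the GPU allocation could not be satisfied.
//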
}} // namespace triton::core
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef TRITON_ENABLE_METRICS
#include "metric_family.h"
#include "metrics.h"
#include "triton/common/logging.h"
namespace triton { namespace core {
//
// Implementation for TRITONSERVER_MetricFamily.
//
MetricFamily::MetricFamily(
TRITONSERVER_MetricKind kind, const char* name, const char* description)
{
auto registry = Metrics::GetRegistry();
switch (kind) {
case TRITONSERVER_METRIC_KIND_COUNTER:
family_ = reinterpret_cast<void*>(&prometheus::BuildCounter()
.Name(name)
.Help(description)
.Register(*registry));
break;
case TRITONSERVER_METRIC_KIND_GAUGE:
family_ = reinterpret_cast<void*>(&prometheus::BuildGauge()
.Name(name)
.Help(description)
.Register(*registry));
break;
default:
throw std::invalid_argument(
"Unsupported kind passed to MetricFamily constructor.");
}
kind_ = kind;
}
void*
MetricFamily::Add(std::map<std::string, std::string> label_map, Metric* metric)
{
void* prom_metric = nullptr;
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = &counter_family_ptr->Add(label_map);
prom_metric = reinterpret_cast<void*>(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = &gauge_family_ptr->Add(label_map);
prom_metric = reinterpret_cast<void*>(gauge_ptr);
break;
}
default:
throw std::invalid_argument(
"Unsupported family kind passed to Metric constructor.");
}
std::lock_guard<std::mutex> lk(metric_mtx_);
++prom_metric_ref_cnt_[prom_metric];
child_metrics_.insert(metric);
return prom_metric;
}
void
MetricFamily::Remove(void* prom_metric, Metric* metric)
{
{
// Remove reference to dependent Metric object
std::lock_guard<std::mutex> lk(metric_mtx_);
child_metrics_.erase(metric);
}
if (prom_metric == nullptr) {
return;
}
{
std::lock_guard<std::mutex> lk(metric_mtx_);
const auto it = prom_metric_ref_cnt_.find(prom_metric);
if (it != prom_metric_ref_cnt_.end()) {
--it->second;
if (it->second == 0) {
prom_metric_ref_cnt_.erase(it);
} else {
// Done as it is not the last reference
return;
}
}
}
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Counter>*>(family_);
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(prom_metric);
counter_family_ptr->Remove(counter_ptr);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_family_ptr =
reinterpret_cast<prometheus::Family<prometheus::Gauge>*>(family_);
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(prom_metric);
gauge_family_ptr->Remove(gauge_ptr);
break;
}
default:
// Invalid kind should be caught in constructor
LOG_ERROR << "Unsupported kind in Metric destructor.";
break;
}
}
void
MetricFamily::InvalidateReferences()
{
std::lock_guard<std::mutex> lk(metric_mtx_);
for (auto& metric : child_metrics_) {
if (metric != nullptr) {
metric->Invalidate();
}
}
child_metrics_.clear();
}
MetricFamily::~MetricFamily()
{
if (NumMetrics() > 0) {
LOG_WARNING << "MetricFamily was deleted before its child Metrics, this "
"should not happen. Make sure to delete all child Metrics "
"before deleting their MetricFamily.";
}
InvalidateReferences();
// DLIS-4072: Support for removing metric families from registry
}
//
// Implementation for TRITONSERVER_Metric.
//
Metric::Metric(
TRITONSERVER_MetricFamily* family,
std::vector<const InferenceParameter*> labels)
{
family_ = reinterpret_cast<MetricFamily*>(family);
kind_ = family_->Kind();
// Create map of labels from InferenceParameters
std::map<std::string, std::string> label_map;
for (const auto& param : labels) {
if (param->Type() != TRITONSERVER_PARAMETER_STRING) {
throw std::invalid_argument(
"Parameter [" + param->Name() +
"] must have a type of TRITONSERVER_PARAMETER_STRING to be "
"added as a label.");
}
label_map[param->Name()] =
std::string(reinterpret_cast<const char*>(param->ValuePointer()));
}
metric_ = family_->Add(label_map, this);
}
Metric::~Metric()
{
if (family_ != nullptr) {
family_->Remove(metric_, this);
} else {
LOG_WARNING << "Corresponding MetricFamily was deleted before this Metric, "
"this should not happen. Make sure to delete a Metric "
"before deleting its MetricFamily.";
}
// Catch lifetime management / invalid reference issues
Invalidate();
}
void
Metric::Invalidate()
{
family_ = nullptr;
metric_ = nullptr;
}
TRITONSERVER_Error*
Metric::Value(double* value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not get metric value. Metric has been invalidated.");
}
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(metric_);
LOG_VERBOSE(1) << "SETTING COUNTER METRIC FROM: " << *value << " to "
<< counter_ptr->Value();
*value = counter_ptr->Value();
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(metric_);
LOG_VERBOSE(1) << "SETTING GAUGE METRIC FROM: " << *value << " to "
<< gauge_ptr->Value();
*value = gauge_ptr->Value();
break;
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Unsupported TRITONSERVER_MetricKind");
}
return nullptr; // Success
}
TRITONSERVER_Error*
Metric::Increment(double value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not increment metric value. Metric has been invalidated.");
}
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
if (value < 0.0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
"TRITONSERVER_METRIC_KIND_COUNTER can only be incremented "
"monotonically by non-negative values.");
}
auto counter_ptr = reinterpret_cast<prometheus::Counter*>(metric_);
counter_ptr->Increment(value);
break;
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(metric_);
// Gauge::Increment accepts both positive and negative values as of
// prometheus-cpp v1.0, but while we are on v0.7 we dispatch to
// Increment/Decrement based on the sign of the value.
// https://github.com/jupp0r/prometheus-cpp/blob/master/core/src/gauge.cc
if (value < 0.0) {
gauge_ptr->Decrement(-1.0 * value);
} else {
gauge_ptr->Increment(value);
}
break;
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Unsupported TRITONSERVER_MetricKind");
}
return nullptr; // Success
}
TRITONSERVER_Error*
Metric::Set(double value)
{
if (metric_ == nullptr) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
"Could not set metric value. Metric has been invalidated.");
}
switch (kind_) {
case TRITONSERVER_METRIC_KIND_COUNTER: {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"TRITONSERVER_METRIC_KIND_COUNTER does not support Set");
}
case TRITONSERVER_METRIC_KIND_GAUGE: {
auto gauge_ptr = reinterpret_cast<prometheus::Gauge*>(metric_);
gauge_ptr->Set(value);
break;
}
default:
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"Unsupported TRITONSERVER_MetricKind");
}
return nullptr; // Success
}
}} // namespace triton::core
#endif // TRITON_ENABLE_METRICS
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#ifdef TRITON_ENABLE_METRICS
#include <mutex>
#include <set>
#include <unordered_map>
#include "infer_parameter.h"
#include "prometheus/registry.h"
#include "tritonserver_apis.h"
namespace triton { namespace core {
//
// Implementation for TRITONSERVER_MetricFamily.
//
class Metric;
class MetricFamily {
public:
MetricFamily(
TRITONSERVER_MetricKind kind, const char* name, const char* description);
~MetricFamily();
void* Family() const { return family_; }
TRITONSERVER_MetricKind Kind() const { return kind_; }
void* Add(std::map<std::string, std::string> label_map, Metric* metric);
void Remove(void* prom_metric, Metric* metric);
int NumMetrics()
{
std::lock_guard<std::mutex> lk(metric_mtx_);
return child_metrics_.size();
}
private:
// If a MetricFamily is deleted before its dependent Metric, we want to
// invalidate the reference so we don't access invalid memory.
void InvalidateReferences();
void* family_;
TRITONSERVER_MetricKind kind_;
// Synchronize access of related metric objects
std::mutex metric_mtx_;
// Prometheus returns the existing metric pointer if a metric with the same
// set of labels is requested; as a result, different Metric objects may
// refer to the same prometheus metric. So we must track the reference count
// of the metric and ask prometheus to remove it only when all references
// have been released.
std::unordered_map<void*, size_t> prom_metric_ref_cnt_;
// Maintain references to metrics created from this metric family so their
// references can be invalidated if the family is deleted before its metrics.
std::set<Metric*> child_metrics_;
};
//
// Implementation for TRITONSERVER_Metric.
//
class Metric {
public:
Metric(
TRITONSERVER_MetricFamily* family,
std::vector<const InferenceParameter*> labels);
~Metric();
MetricFamily* Family() const { return family_; }
TRITONSERVER_MetricKind Kind() const { return kind_; }
TRITONSERVER_Error* Value(double* value);
TRITONSERVER_Error* Increment(double value);
TRITONSERVER_Error* Set(double value);
// If a MetricFamily is deleted before its dependent Metric, we want to
// invalidate the references so we don't access invalid memory.
void Invalidate();
private:
void* metric_;
MetricFamily* family_;
TRITONSERVER_MetricKind kind_;
};
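//
// A minimal usage sketch of the two classes above (illustration only, not
// part of this commit; the family name and description are assumptions):
//
//   MetricFamily family(
//       TRITONSERVER_METRIC_KIND_COUNTER, "custom_counter", "example counter");
//   Metric metric(
//       reinterpret_cast<TRITONSERVER_MetricFamily*>(&family), {} /* labels */);
//   metric.Increment(1.0);
//   double value = 0.0;
//   metric.Value(&value);  // value == 1.0
//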
}} // namespace triton::core
#endif // TRITON_ENABLE_METRICS
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "metric_model_reporter.h"
#ifdef TRITON_ENABLE_METRICS
#include "constants.h"
#include "metrics.h"
namespace triton { namespace core {
Status
MetricModelReporter::Create(
const std::string& model_name, const int64_t model_version,
const int device, const triton::common::MetricTagsMap& model_tags,
std::shared_ptr<MetricModelReporter>* metric_model_reporter)
{
static std::mutex mtx;
static std::unordered_map<size_t, std::weak_ptr<MetricModelReporter>>
reporter_map;
std::map<std::string, std::string> labels;
GetMetricLabels(&labels, model_name, model_version, device, model_tags);
auto hash_labels = Metrics::HashLabels(labels);
std::lock_guard<std::mutex> lock(mtx);
const auto& itr = reporter_map.find(hash_labels);
if (itr != reporter_map.end()) {
// Found in map. If the weak_ptr is still valid that means that
// there are other models using the reporter and we just reuse that
// same reporter. If the weak_ptr is not valid then we need to remove
// the weak_ptr from the map and create the reporter again.
*metric_model_reporter = itr->second.lock();
if (*metric_model_reporter != nullptr) {
return Status::Success;
}
reporter_map.erase(itr);
}
metric_model_reporter->reset(
new MetricModelReporter(model_name, model_version, device, model_tags));
reporter_map.insert({hash_labels, *metric_model_reporter});
return Status::Success;
}
MetricModelReporter::MetricModelReporter(
const std::string& model_name, const int64_t model_version,
const int device, const triton::common::MetricTagsMap& model_tags)
{
std::map<std::string, std::string> labels;
GetMetricLabels(&labels, model_name, model_version, device, model_tags);
metric_inf_success_ =
CreateCounterMetric(Metrics::FamilyInferenceSuccess(), labels);
metric_inf_failure_ =
CreateCounterMetric(Metrics::FamilyInferenceFailure(), labels);
metric_inf_count_ =
CreateCounterMetric(Metrics::FamilyInferenceCount(), labels);
metric_inf_exec_count_ =
CreateCounterMetric(Metrics::FamilyInferenceExecutionCount(), labels);
metric_inf_request_duration_us_ =
CreateCounterMetric(Metrics::FamilyInferenceRequestDuration(), labels);
metric_inf_queue_duration_us_ =
CreateCounterMetric(Metrics::FamilyInferenceQueueDuration(), labels);
metric_inf_compute_input_duration_us_ = CreateCounterMetric(
Metrics::FamilyInferenceComputeInputDuration(), labels);
metric_inf_compute_infer_duration_us_ = CreateCounterMetric(
Metrics::FamilyInferenceComputeInferDuration(), labels);
metric_inf_compute_output_duration_us_ = CreateCounterMetric(
Metrics::FamilyInferenceComputeOutputDuration(), labels);
metric_cache_hit_count_ =
CreateCounterMetric(Metrics::FamilyCacheHitCount(), labels);
metric_cache_hit_lookup_duration_us_ =
CreateCounterMetric(Metrics::FamilyCacheHitLookupDuration(), labels);
metric_cache_miss_count_ =
CreateCounterMetric(Metrics::FamilyCacheMissCount(), labels);
metric_cache_miss_lookup_duration_us_ =
CreateCounterMetric(Metrics::FamilyCacheMissLookupDuration(), labels);
metric_cache_miss_insertion_duration_us_ =
CreateCounterMetric(Metrics::FamilyCacheMissInsertionDuration(), labels);
}
MetricModelReporter::~MetricModelReporter()
{
Metrics::FamilyInferenceSuccess().Remove(metric_inf_success_);
Metrics::FamilyInferenceFailure().Remove(metric_inf_failure_);
Metrics::FamilyInferenceCount().Remove(metric_inf_count_);
Metrics::FamilyInferenceExecutionCount().Remove(metric_inf_exec_count_);
Metrics::FamilyInferenceRequestDuration().Remove(
metric_inf_request_duration_us_);
Metrics::FamilyInferenceQueueDuration().Remove(metric_inf_queue_duration_us_);
Metrics::FamilyInferenceComputeInputDuration().Remove(
metric_inf_compute_input_duration_us_);
Metrics::FamilyInferenceComputeInferDuration().Remove(
metric_inf_compute_infer_duration_us_);
Metrics::FamilyInferenceComputeOutputDuration().Remove(
metric_inf_compute_output_duration_us_);
Metrics::FamilyCacheHitCount().Remove(metric_cache_hit_count_);
Metrics::FamilyCacheHitLookupDuration().Remove(
metric_cache_hit_lookup_duration_us_);
Metrics::FamilyCacheMissCount().Remove(metric_cache_miss_count_);
Metrics::FamilyCacheMissLookupDuration().Remove(
metric_cache_miss_lookup_duration_us_);
Metrics::FamilyCacheMissInsertionDuration().Remove(
metric_cache_miss_insertion_duration_us_);
}
void
MetricModelReporter::GetMetricLabels(
std::map<std::string, std::string>* labels, const std::string& model_name,
const int64_t model_version, const int device,
const triton::common::MetricTagsMap& model_tags)
{
labels->insert(std::map<std::string, std::string>::value_type(
std::string(kMetricsLabelModelName), model_name));
labels->insert(std::map<std::string, std::string>::value_type(
std::string(kMetricsLabelModelVersion), std::to_string(model_version)));
for (const auto& tag : model_tags) {
labels->insert(std::map<std::string, std::string>::value_type(
"_" + tag.first, tag.second));
}
// 'device' can be < 0 to indicate that the GPU is not known. In
// that case use a metric that doesn't have the gpu_uuid label.
if (device >= 0) {
std::string uuid;
if (Metrics::UUIDForCudaDevice(device, &uuid)) {
labels->insert(std::map<std::string, std::string>::value_type(
std::string(kMetricsLabelGpuUuid), uuid));
}
}
}
prometheus::Counter*
MetricModelReporter::CreateCounterMetric(
prometheus::Family<prometheus::Counter>& family,
const std::map<std::string, std::string>& labels)
{
return &family.Add(labels);
}
}} // namespace triton::core
#endif // TRITON_ENABLE_METRICS
// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "status.h"
#include "triton/common/model_config.h"
#ifdef TRITON_ENABLE_METRICS
#include "prometheus/registry.h"
#endif // TRITON_ENABLE_METRICS
namespace triton { namespace core {
//
// Interface for a metric reporter for a given version of a model.
//
class MetricModelReporter {
public:
#ifdef TRITON_ENABLE_METRICS
static Status Create(
const std::string& model_name, const int64_t model_version,
const int device, const triton::common::MetricTagsMap& model_tags,
std::shared_ptr<MetricModelReporter>* metric_model_reporter);
~MetricModelReporter();
// Get a metric for the given model, version and GPU index.
prometheus::Counter& MetricInferenceSuccess() const
{
return *metric_inf_success_;
}
prometheus::Counter& MetricInferenceFailure() const
{
return *metric_inf_failure_;
}
prometheus::Counter& MetricInferenceCount() const
{
return *metric_inf_count_;
}
prometheus::Counter& MetricInferenceExecutionCount() const
{
return *metric_inf_exec_count_;
}
prometheus::Counter& MetricInferenceRequestDuration() const
{
return *metric_inf_request_duration_us_;
}
prometheus::Counter& MetricInferenceQueueDuration() const
{
return *metric_inf_queue_duration_us_;
}
prometheus::Counter& MetricInferenceComputeInputDuration() const
{
return *metric_inf_compute_input_duration_us_;
}
prometheus::Counter& MetricInferenceComputeInferDuration() const
{
return *metric_inf_compute_infer_duration_us_;
}
prometheus::Counter& MetricInferenceComputeOutputDuration() const
{
return *metric_inf_compute_output_duration_us_;
}
prometheus::Counter& MetricCacheHitCount() const
{
return *metric_cache_hit_count_;
}
prometheus::Counter& MetricCacheHitLookupDuration() const
{
return *metric_cache_hit_lookup_duration_us_;
}
prometheus::Counter& MetricCacheMissCount() const
{
return *metric_cache_miss_count_;
}
prometheus::Counter& MetricCacheMissLookupDuration() const
{
return *metric_cache_miss_lookup_duration_us_;
}
prometheus::Counter& MetricCacheMissInsertionDuration() const
{
return *metric_cache_miss_insertion_duration_us_;
}
private:
MetricModelReporter(
const std::string& model_name, const int64_t model_version,
const int device, const triton::common::MetricTagsMap& model_tags);
static void GetMetricLabels(
std::map<std::string, std::string>* labels, const std::string& model_name,
const int64_t model_version, const int device,
const triton::common::MetricTagsMap& model_tags);
prometheus::Counter* CreateCounterMetric(
prometheus::Family<prometheus::Counter>& family,
const std::map<std::string, std::string>& labels);
prometheus::Counter* metric_inf_success_;
prometheus::Counter* metric_inf_failure_;
prometheus::Counter* metric_inf_count_;
prometheus::Counter* metric_inf_exec_count_;
prometheus::Counter* metric_inf_request_duration_us_;
prometheus::Counter* metric_inf_queue_duration_us_;
prometheus::Counter* metric_inf_compute_input_duration_us_;
prometheus::Counter* metric_inf_compute_infer_duration_us_;
prometheus::Counter* metric_inf_compute_output_duration_us_;
prometheus::Counter* metric_cache_hit_count_;
prometheus::Counter* metric_cache_hit_lookup_duration_us_;
prometheus::Counter* metric_cache_miss_count_;
prometheus::Counter* metric_cache_miss_lookup_duration_us_;
prometheus::Counter* metric_cache_miss_insertion_duration_us_;
#endif // TRITON_ENABLE_METRICS
};
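//
// A minimal usage sketch (illustration only, not part of this commit; the
// model name, version, and device are assumptions):
//
//   std::shared_ptr<MetricModelReporter> reporter;
//   MetricModelReporter::Create(
//       "resnet50", 1 /* version */, 0 /* device */,
//       triton::common::MetricTagsMap(), &reporter);
//   reporter->MetricInferenceSuccess().Increment(1);
//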
}} // namespace triton::core
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifdef TRITON_ENABLE_METRICS
#include "metrics.h"
#include <thread>
#include "constants.h"
#include "prometheus/detail/utils.h"
#include "triton/common/logging.h"
#ifdef TRITON_ENABLE_METRICS_GPU
#include <cuda_runtime_api.h>
#include <dcgm_agent.h>
#include <cstring>
#include <set>
#include <string>
#endif // TRITON_ENABLE_METRICS_GPU
namespace triton { namespace core {
Metrics::Metrics()
: registry_(std::make_shared<prometheus::Registry>()),
serializer_(new prometheus::TextSerializer()),
inf_success_family_(
prometheus::BuildCounter()
.Name("nv_inference_request_success")
.Help("Number of successful inference requests, all batch sizes")
.Register(*registry_)),
inf_failure_family_(
prometheus::BuildCounter()
.Name("nv_inference_request_failure")
.Help("Number of failed inference requests, all batch sizes")
.Register(*registry_)),
inf_count_family_(prometheus::BuildCounter()
.Name("nv_inference_count")
.Help("Number of inferences performed (does not "
"include cached requests)")
.Register(*registry_)),
inf_count_exec_family_(prometheus::BuildCounter()
.Name("nv_inference_exec_count")
.Help("Number of model executions performed "
"(does not include cached requests)")
.Register(*registry_)),
inf_request_duration_us_family_(
prometheus::BuildCounter()
.Name("nv_inference_request_duration_us")
.Help("Cumulative inference request duration in microseconds "
"(includes cached requests)")
.Register(*registry_)),
inf_queue_duration_us_family_(
prometheus::BuildCounter()
.Name("nv_inference_queue_duration_us")
.Help("Cumulative inference queuing duration in microseconds "
"(includes cached requests)")
.Register(*registry_)),
inf_compute_input_duration_us_family_(
prometheus::BuildCounter()
.Name("nv_inference_compute_input_duration_us")
.Help("Cumulative compute input duration in microseconds (does "
"not include cached requests)")
.Register(*registry_)),
inf_compute_infer_duration_us_family_(
prometheus::BuildCounter()
.Name("nv_inference_compute_infer_duration_us")
.Help("Cumulative compute inference duration in microseconds "
"(does not include cached requests)")
.Register(*registry_)),
inf_compute_output_duration_us_family_(
prometheus::BuildCounter()
.Name("nv_inference_compute_output_duration_us")
.Help("Cumulative inference compute output duration in "
"microseconds (does not include cached requests)")
.Register(*registry_)),
cache_num_entries_family_(
prometheus::BuildGauge()
.Name("nv_cache_num_entries")
.Help("Number of responses stored in response cache")
.Register(*registry_)),
cache_num_lookups_family_(
prometheus::BuildGauge()
.Name("nv_cache_num_lookups")
.Help("Number of cache lookups in response cache")
.Register(*registry_)),
cache_num_hits_family_(prometheus::BuildGauge()
.Name("nv_cache_num_hits")
.Help("Number of cache hits in response cache")
.Register(*registry_)),
cache_num_misses_family_(
prometheus::BuildGauge()
.Name("nv_cache_num_misses")
.Help("Number of cache misses in response cache")
.Register(*registry_)),
cache_num_evictions_family_(
prometheus::BuildGauge()
.Name("nv_cache_num_evictions")
.Help("Number of cache evictions in response cache")
.Register(*registry_)),
cache_lookup_duration_us_family_(
prometheus::BuildGauge()
.Name("nv_cache_lookup_duration")
.Help(
"Total cache lookup duration (hit and miss), in microseconds")
.Register(*registry_)),
cache_insertion_duration_us_family_(
prometheus::BuildGauge()
.Name("nv_cache_insertion_duration")
.Help("Total cache insertion duration, in microseconds")
.Register(*registry_)),
cache_util_family_(prometheus::BuildGauge()
.Name("nv_cache_util")
.Help("Cache utilization [0.0 - 1.0]")
.Register(*registry_)),
// Per-model cache metric families
cache_num_hits_model_family_(prometheus::BuildCounter()
.Name("nv_cache_num_hits_per_model")
.Help("Number of cache hits per model")
.Register(*registry_)),
cache_hit_lookup_duration_us_model_family_(
prometheus::BuildCounter()
.Name("nv_cache_hit_lookup_duration_per_model")
.Help(
"Total cache hit lookup duration per model, in microseconds")
.Register(*registry_)),
cache_num_misses_model_family_(
prometheus::BuildCounter()
.Name("nv_cache_num_misses_per_model")
.Help("Number of cache misses per model")
.Register(*registry_)),
cache_miss_lookup_duration_us_model_family_(
prometheus::BuildCounter()
.Name("nv_cache_miss_lookup_duration_per_model")
.Help(
"Total cache miss lookup duration per model, in microseconds")
.Register(*registry_)),
cache_miss_insertion_duration_us_model_family_(
prometheus::BuildCounter()
.Name("nv_cache_miss_insertion_duration_per_model")
.Help("Total cache miss insertion duration per model, in "
"microseconds")
.Register(*registry_)),
#ifdef TRITON_ENABLE_METRICS_GPU
gpu_utilization_family_(prometheus::BuildGauge()
.Name("nv_gpu_utilization")
.Help("GPU utilization rate [0.0 - 1.0)")
.Register(*registry_)),
gpu_memory_total_family_(prometheus::BuildGauge()
.Name("nv_gpu_memory_total_bytes")
.Help("GPU total memory, in bytes")
.Register(*registry_)),
gpu_memory_used_family_(prometheus::BuildGauge()
.Name("nv_gpu_memory_used_bytes")
.Help("GPU used memory, in bytes")
.Register(*registry_)),
gpu_power_usage_family_(prometheus::BuildGauge()
.Name("nv_gpu_power_usage")
.Help("GPU power usage in watts")
.Register(*registry_)),
gpu_power_limit_family_(prometheus::BuildGauge()
.Name("nv_gpu_power_limit")
.Help("GPU power management limit in watts")
.Register(*registry_)),
gpu_energy_consumption_family_(
prometheus::BuildCounter()
.Name("nv_energy_consumption")
.Help("GPU energy consumption in joules since the Triton Server "
"started")
.Register(*registry_)),
#endif // TRITON_ENABLE_METRICS_GPU
#ifdef TRITON_ENABLE_METRICS_CPU
cpu_utilization_family_(prometheus::BuildGauge()
.Name("nv_cpu_utilization")
.Help("CPU utilization rate [0.0 - 1.0]")
.Register(*registry_)),
cpu_memory_total_family_(prometheus::BuildGauge()
.Name("nv_cpu_memory_total_bytes")
.Help("CPU total memory (RAM), in bytes")
.Register(*registry_)),
cpu_memory_used_family_(prometheus::BuildGauge()
.Name("nv_cpu_memory_used_bytes")
.Help("CPU used memory (RAM), in bytes")
.Register(*registry_)),
#endif // TRITON_ENABLE_METRICS_CPU
metrics_enabled_(false), gpu_metrics_enabled_(false),
cpu_metrics_enabled_(false), cache_metrics_enabled_(false),
metrics_interval_ms_(2000)
{
}
static prometheus::detail::LabelHasher label_hasher_;
size_t
Metrics::HashLabels(const std::map<std::string, std::string>& labels)
{
return label_hasher_(labels);
}
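// Illustrative sketch (assumption): the hash lets callers key per-model
// reporters or metric objects by their label set, e.g.
//
//   std::map<std::string, std::string> labels{
//       {"model", "resnet"}, {"version", "1"}};  // label values illustrative
//   size_t key = Metrics::HashLabels(labels);
//   // reporter_map[key] = reporter;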
Metrics::~Metrics()
{
// Signal the polling thread to exit and then wait for it...
if (poll_thread_ != nullptr) {
poll_thread_exit_.store(true);
poll_thread_->join();
#ifdef TRITON_ENABLE_METRICS_GPU
if (dcgm_metadata_.dcgm_initialized_) {
dcgmReturn_t derr;
// Group destroy will return an error if the groupId is invalid or DCGM is
// not initialized or configured correctly
derr = dcgmGroupDestroy(
dcgm_metadata_.dcgm_handle_, dcgm_metadata_.groupId_);
if (derr != DCGM_ST_OK) {
LOG_WARNING << "Unable to destroy DCGM group: " << errorString(derr);
}
// Stop and shutdown DCGM
if (dcgm_metadata_.standalone_) {
derr = dcgmDisconnect(dcgm_metadata_.dcgm_handle_);
} else {
derr = dcgmStopEmbedded(dcgm_metadata_.dcgm_handle_);
}
if (derr != DCGM_ST_OK) {
LOG_WARNING << "Unable to stop DCGM: " << errorString(derr);
}
derr = dcgmShutdown();
if (derr != DCGM_ST_OK) {
LOG_WARNING << "Unable to shutdown DCGM: " << errorString(derr);
}
}
#endif // TRITON_ENABLE_METRICS_GPU
}
}
bool
Metrics::Enabled()
{
auto singleton = GetSingleton();
return singleton->metrics_enabled_;
}
void
Metrics::EnableMetrics()
{
auto singleton = GetSingleton();
singleton->metrics_enabled_ = true;
}
void
Metrics::EnableCacheMetrics(
std::shared_ptr<RequestResponseCache> response_cache)
{
auto singleton = GetSingleton();
// Ensure thread-safe enabling of Cache Metrics
std::lock_guard<std::mutex> lock(singleton->metrics_enabling_);
if (singleton->cache_metrics_enabled_) {
return;
}
singleton->InitializeCacheMetrics(response_cache);
singleton->cache_metrics_enabled_ = true;
}
void
Metrics::EnableGPUMetrics()
{
auto singleton = GetSingleton();
// Ensure thread-safe enabling of GPU Metrics
std::lock_guard<std::mutex> lock(singleton->metrics_enabling_);
if (singleton->gpu_metrics_enabled_) {
return;
}
if (std::getenv("TRITON_SERVER_CPU_ONLY") == nullptr) {
singleton->InitializeDcgmMetrics();
}
singleton->gpu_metrics_enabled_ = true;
}
void
Metrics::EnableCpuMetrics()
{
auto singleton = GetSingleton();
// Ensure thread-safe enabling of CPU Metrics
std::lock_guard<std::mutex> lock(singleton->metrics_enabling_);
if (singleton->cpu_metrics_enabled_) {
return;
}
singleton->InitializeCpuMetrics();
singleton->cpu_metrics_enabled_ = true;
}
void
Metrics::SetMetricsInterval(uint64_t metrics_interval_ms)
{
auto singleton = GetSingleton();
singleton->metrics_interval_ms_ = metrics_interval_ms;
}
void
Metrics::StartPollingThreadSingleton(
std::shared_ptr<RequestResponseCache> response_cache)
{
auto singleton = GetSingleton();
// Ensure thread-safe start of polling thread
std::lock_guard<std::mutex> lock(singleton->poll_thread_starting_);
if (singleton->poll_thread_started_) {
return;
}
// Start thread for polling cache/GPU/CPU metrics
singleton->StartPollingThread(response_cache);
// Toggle flag so this function is only executed once
singleton->poll_thread_started_ = true;
}
bool
Metrics::StartPollingThread(
std::shared_ptr<RequestResponseCache> response_cache)
{
// Nothing to poll if no polling metrics are enabled; don't spawn a thread
if (!cache_metrics_enabled_ && !gpu_metrics_enabled_ &&
!cpu_metrics_enabled_) {
LOG_WARNING << "No polling metrics (CPU, GPU, Cache) are enabled. Will not "
"poll for them.";
return false;
}
poll_thread_exit_.store(false);
// Start a separate thread for polling metrics at specified interval
poll_thread_.reset(new std::thread([this, response_cache] {
// Thread will update metrics indefinitely until exit flag set
while (!poll_thread_exit_.load()) {
// Sleep for half the metrics interval between polls
std::this_thread::sleep_for(
std::chrono::milliseconds(metrics_interval_ms_ / 2));
// Poll Response Cache metrics
if (cache_metrics_enabled_ && response_cache != nullptr) {
PollCacheMetrics(response_cache);
}
#ifdef TRITON_ENABLE_METRICS_GPU
// Poll DCGM GPU metrics
if (gpu_metrics_enabled_ &&
dcgm_metadata_.available_cuda_gpu_ids_.size() > 0) {
PollDcgmMetrics();
}
#endif // TRITON_ENABLE_METRICS_GPU
#ifdef TRITON_ENABLE_METRICS_CPU
if (cpu_metrics_enabled_) {
PollCpuMetrics();
}
#endif // TRITON_ENABLE_METRICS_CPU
}
}));
return true;
}
bool
Metrics::PollCacheMetrics(std::shared_ptr<RequestResponseCache> response_cache)
{
if (response_cache == nullptr) {
LOG_WARNING << "error polling cache metrics, cache metrics will not be "
<< "available: cache was nullptr";
return false;
}
// Update global cache metrics
cache_num_entries_global_->Set(response_cache->NumEntries());
cache_num_lookups_global_->Set(response_cache->NumLookups());
cache_num_hits_global_->Set(response_cache->NumHits());
cache_num_misses_global_->Set(response_cache->NumMisses());
cache_num_evictions_global_->Set(response_cache->NumEvictions());
cache_lookup_duration_us_global_->Set(
response_cache->TotalLookupLatencyNs() / 1000);
cache_insertion_duration_us_global_->Set(
response_cache->TotalInsertionLatencyNs() / 1000);
cache_util_global_->Set(response_cache->TotalUtilization());
return true;
}
#ifdef TRITON_ENABLE_METRICS_CPU
Status
Metrics::ParseCpuInfo(CpuInfo& info)
{
#ifdef _WIN32
return Status(
Status::Code::INTERNAL, "CPU metrics not supported on Windows.");
#else
std::ifstream ifs("/proc/stat");
if (!ifs.good()) {
return Status(Status::Code::INTERNAL, "Failed to open /proc/stat.");
}
std::string line;
// Verify first line is aggregate cpu line
std::getline(ifs, line);
if (line.rfind("cpu ", 0) == std::string::npos) {
return Status(
Status::Code::INTERNAL,
"Failed to find aggregate CPU info in /proc/stat.");
}
std::string _;
std::istringstream iss(line);
// Use _ to skip "cpu" at start of line
if (!(iss >> _ >> info)) {
return Status(
Status::Code::INTERNAL,
"Failed to parse aggregate CPU info in /proc/stat.");
}
return Status::Success;
#endif // OS
}
Status
Metrics::ParseMemInfo(MemInfo& info)
{
#ifdef _WIN32
return Status(
Status::Code::INTERNAL, "Memory metrics not supported on Windows.");
#else
std::ifstream ifs("/proc/meminfo");
if (!ifs.good()) {
return Status(Status::Code::INTERNAL, "Failed to open /proc/meminfo.");
}
std::string line;
constexpr uint64_t KB = 1024;
while (std::getline(ifs, line)) {
std::istringstream iss(line);
std::string name;
uint64_t value = 0;
if (iss >> name >> value) {
name.pop_back();
info[name] = value * KB;
} else {
return Status(
Status::Code::INTERNAL, "Encountered error parsing /proc/meminfo.");
}
}
if (info.find("MemTotal") == info.end() ||
info.find("MemAvailable") == info.end()) {
return Status(
Status::Code::INTERNAL,
"Failed to find desired values in /proc/meminfo.");
}
if (info["MemAvailable"] > info["MemTotal"]) {
return Status(
Status::Code::INTERNAL,
"Available bytes shouldn't be greater than Total bytes");
}
// "Used" memory can be defined in many different ways. While many
// older applications consider "used = total - (free + cached)", a more
// accurate measure of available memory "MemAvailable" was added,
// so we choose "used = total - available" for a more accurate measure.
// This may change in the future if not sufficient for most use cases.
// See https://stackoverflow.com/a/35019697.
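// Worked example for this choice (values illustrative): with
// MemTotal = 16,000,000 kB and MemAvailable = 10,000,000 kB, the map holds
// 16,384,000,000 and 10,240,000,000 bytes respectively, so
// MemUsed = 16,384,000,000 - 10,240,000,000 = 6,144,000,000 bytes.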
info["MemUsed"] = info["MemTotal"] - info["MemAvailable"];
return Status::Success;
#endif // OS
}
double
Metrics::CpuUtilization(const CpuInfo& info_new, const CpuInfo& info_old)
{
// Account for overflow
const auto wrap_sub = [](uint64_t a, uint64_t b) {
return (a > b) ? (a - b) : 0;
};
uint64_t util_diff = wrap_sub(info_new.user, info_old.user) +
wrap_sub(info_new.nice, info_old.nice) +
wrap_sub(info_new.system, info_old.system) +
wrap_sub(info_new.irq, info_old.irq) +
wrap_sub(info_new.softirq, info_old.softirq) +
wrap_sub(info_new.steal, info_old.steal);
uint64_t idle_diff = wrap_sub(info_new.idle, info_old.idle) +
wrap_sub(info_new.iowait, info_old.iowait);
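  // Worked example (illustrative): if the busy jiffies advanced by 300 and
  // the idle+iowait jiffies by 700 since the last poll, the utilization is
  // 300 / (300 + 700) = 0.3.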
double util_ratio = static_cast<double>(util_diff) / (util_diff + idle_diff);
return util_ratio;
}
#endif // TRITON_ENABLE_METRICS_CPU
bool
Metrics::PollCpuMetrics()
{
#ifndef TRITON_ENABLE_METRICS_CPU
return false;
#else
// CPU Utilization
double cpu_util = 0.0;
auto cpu_info = CpuInfo();
auto status = ParseCpuInfo(cpu_info);
if (status.IsOk()) {
cpu_util = CpuUtilization(cpu_info, last_cpu_info_);
last_cpu_info_ = cpu_info;
}
cpu_utilization_->Set(cpu_util); // [0.0, 1.0]
// RAM / Memory
double mem_total_bytes = 0.0;
double mem_used_bytes = 0.0;
auto mem_info = MemInfo();
status = ParseMemInfo(mem_info);
if (status.IsOk()) {
// MemTotal will usually not change over time, but if something
// goes wrong when querying memory, we can reflect that by updating.
mem_total_bytes = mem_info["MemTotal"];
mem_used_bytes = mem_info["MemUsed"];
}
cpu_memory_total_->Set(mem_total_bytes);
cpu_memory_used_->Set(mem_used_bytes);
return true;
#endif // TRITON_ENABLE_METRICS_CPU
}
bool
Metrics::PollDcgmMetrics()
{
#ifndef TRITON_ENABLE_METRICS_GPU
return false;
#else
if (dcgm_metadata_.available_cuda_gpu_ids_.size() == 0) {
LOG_WARNING << "error polling GPU metrics, GPU metrics will not be "
<< "available: no available gpus to poll";
return false;
}
dcgmUpdateAllFields(dcgm_metadata_.dcgm_handle_, 1 /* wait for update*/);
for (unsigned int didx = 0;
didx < dcgm_metadata_.available_cuda_gpu_ids_.size(); ++didx) {
uint32_t cuda_id = dcgm_metadata_.available_cuda_gpu_ids_[didx];
if (dcgm_metadata_.cuda_ids_to_dcgm_ids_.count(cuda_id) <= 0) {
LOG_WARNING << "Cannot find DCGM id for CUDA id " << cuda_id;
continue;
}
uint32_t dcgm_id = dcgm_metadata_.cuda_ids_to_dcgm_ids_.at(cuda_id);
dcgmFieldValue_v1 field_values[dcgm_metadata_.field_count_];
dcgmReturn_t dcgmerr = dcgmGetLatestValuesForFields(
dcgm_metadata_.dcgm_handle_, dcgm_id, dcgm_metadata_.fields_.data(),
dcgm_metadata_.field_count_, field_values);
if (dcgmerr != DCGM_ST_OK) {
dcgm_metadata_.power_limit_fail_cnt_[didx]++;
dcgm_metadata_.power_usage_fail_cnt_[didx]++;
dcgm_metadata_.energy_fail_cnt_[didx]++;
dcgm_metadata_.util_fail_cnt_[didx]++;
dcgm_metadata_.mem_fail_cnt_[didx]++;
LOG_WARNING << "Unable to get field values for GPU ID " << cuda_id << ": "
<< errorString(dcgmerr);
} else {
// Power limit
if (dcgm_metadata_.power_limit_fail_cnt_[didx] <
dcgm_metadata_.fail_threshold_) {
double power_limit = field_values[0].value.dbl;
if ((field_values[0].status == DCGM_ST_OK) &&
(!DCGM_FP64_IS_BLANK(power_limit))) {
dcgm_metadata_.power_limit_fail_cnt_[didx] = 0;
} else {
dcgm_metadata_.power_limit_fail_cnt_[didx]++;
power_limit = 0;
dcgmReturn_t status = dcgmReturn_t(field_values[0].status);
LOG_WARNING << "Unable to get power limit for GPU " << cuda_id
<< ". Status:" << errorString(status)
<< ", value:" << dcgmValueToErrorMessage(power_limit);
}
gpu_power_limit_[didx]->Set(power_limit);
}
// Power usage
if (dcgm_metadata_.power_usage_fail_cnt_[didx] <
dcgm_metadata_.fail_threshold_) {
double power_usage = field_values[1].value.dbl;
if ((field_values[1].status == DCGM_ST_OK) &&
(!DCGM_FP64_IS_BLANK(power_usage))) {
dcgm_metadata_.power_usage_fail_cnt_[didx] = 0;
} else {
dcgm_metadata_.power_usage_fail_cnt_[didx]++;
power_usage = 0;
dcgmReturn_t status = dcgmReturn_t(field_values[1].status);
LOG_WARNING << "Unable to get power usage for GPU " << cuda_id
<< ". Status:" << errorString(status)
<< ", value:" << dcgmValueToErrorMessage(power_usage);
}
gpu_power_usage_[didx]->Set(power_usage);
}
// Energy Consumption
if (dcgm_metadata_.energy_fail_cnt_[didx] <
dcgm_metadata_.fail_threshold_) {
int64_t energy = field_values[2].value.i64;
if ((field_values[2].status == DCGM_ST_OK) &&
(!DCGM_INT64_IS_BLANK(energy))) {
dcgm_metadata_.energy_fail_cnt_[didx] = 0;
if (dcgm_metadata_.last_energy_[didx] == 0) {
dcgm_metadata_.last_energy_[didx] = energy;
}
gpu_energy_consumption_[didx]->Increment(
(double)(energy - dcgm_metadata_.last_energy_[didx]) * 0.001);
dcgm_metadata_.last_energy_[didx] = energy;
} else {
dcgm_metadata_.energy_fail_cnt_[didx]++;
energy = 0;
dcgmReturn_t status = dcgmReturn_t(field_values[2].status);
LOG_WARNING << "Unable to get energy consumption for "
<< "GPU " << cuda_id << ". Status:" << errorString(status)
<< ", value:" << dcgmValueToErrorMessage(energy);
}
}
// Utilization
if (dcgm_metadata_.util_fail_cnt_[didx] <
dcgm_metadata_.fail_threshold_) {
int64_t util = field_values[3].value.i64;
if ((field_values[3].status == DCGM_ST_OK) &&
(!DCGM_INT64_IS_BLANK(util))) {
dcgm_metadata_.util_fail_cnt_[didx] = 0;
} else {
dcgm_metadata_.util_fail_cnt_[didx]++;
util = 0;
dcgmReturn_t status = dcgmReturn_t(field_values[3].status);
LOG_WARNING << "Unable to get GPU utilization for GPU " << cuda_id
<< ". Status:" << errorString(status)
<< ", value:" << dcgmValueToErrorMessage(util);
}
gpu_utilization_[didx]->Set((double)util * 0.01);
}
// Memory Usage
if (dcgm_metadata_.mem_fail_cnt_[didx] < dcgm_metadata_.fail_threshold_) {
int64_t memory_used = field_values[4].value.i64;
int64_t memory_total = field_values[5].value.i64;
if ((field_values[4].status == DCGM_ST_OK) &&
(!DCGM_INT64_IS_BLANK(memory_used)) &&
(field_values[5].status == DCGM_ST_OK) &&
(!DCGM_INT64_IS_BLANK(memory_total))) {
dcgm_metadata_.mem_fail_cnt_[didx] = 0;
} else {
memory_total = 0;
memory_used = 0;
dcgm_metadata_.mem_fail_cnt_[didx]++;
dcgmReturn_t usageStatus = dcgmReturn_t(field_values[4].status);
dcgmReturn_t memoryTotalStatus = dcgmReturn_t(field_values[5].status);
LOG_WARNING << "Unable to get memory usage for GPU " << cuda_id
<< ". Memory usage status:" << errorString(usageStatus)
<< ", value:" << dcgmValueToErrorMessage(memory_used)
<< ". Memory total status:"
<< errorString(memoryTotalStatus)
<< ", value:" << dcgmValueToErrorMessage(memory_total);
}
gpu_memory_total_[didx]->Set(memory_total * 1024 * 1024); // bytes
gpu_memory_used_[didx]->Set(memory_used * 1024 * 1024); // bytes
}
}
}
return true;
#endif // TRITON_ENABLE_METRICS_GPU
}
bool
Metrics::InitializeCacheMetrics(
std::shared_ptr<RequestResponseCache> response_cache)
{
if (response_cache == nullptr) {
LOG_WARNING
<< "error initializing cache metrics, cache metrics will not be "
<< "available: cache was nullptr";
return false;
}
const std::map<std::string, std::string> cache_labels;
cache_num_entries_global_ = &cache_num_entries_family_.Add(cache_labels);
cache_num_lookups_global_ = &cache_num_lookups_family_.Add(cache_labels);
cache_num_hits_global_ = &cache_num_hits_family_.Add(cache_labels);
cache_num_misses_global_ = &cache_num_misses_family_.Add(cache_labels);
cache_num_evictions_global_ = &cache_num_evictions_family_.Add(cache_labels);
cache_lookup_duration_us_global_ =
&cache_lookup_duration_us_family_.Add(cache_labels);
cache_insertion_duration_us_global_ =
&cache_insertion_duration_us_family_.Add(cache_labels);
cache_util_global_ = &cache_util_family_.Add(cache_labels);
LOG_INFO << "Collecting Response Cache metrics";
return true;
}
bool
Metrics::InitializeCpuMetrics()
{
#ifndef TRITON_ENABLE_METRICS_CPU
return false;
#else
const std::map<std::string, std::string> cpu_labels;
cpu_utilization_ = &cpu_utilization_family_.Add(cpu_labels);
cpu_memory_total_ = &cpu_memory_total_family_.Add(cpu_labels);
cpu_memory_used_ = &cpu_memory_used_family_.Add(cpu_labels);
// Get baseline CPU info for future comparisons
last_cpu_info_ = CpuInfo();
auto status = ParseCpuInfo(last_cpu_info_);
if (!status.IsOk()) {
LOG_WARNING << "error initializing CPU metrics, CPU utilization may not "
"be available: "
<< status.Message();
return false;
}
// Verify memory metrics can be parsed
auto mem_info = MemInfo();
status = ParseMemInfo(mem_info);
if (!status.IsOk()) {
LOG_WARNING << "error initializing CPU metrics, CPU memory metrics may not "
"be available: "
<< status.Message();
return false;
}
LOG_INFO << "Collecting CPU metrics";
return true;
#endif // TRITON_ENABLE_METRICS_CPU
}
bool
Metrics::InitializeDcgmMetrics()
{
#ifndef TRITON_ENABLE_METRICS_GPU
return false;
#else
dcgmReturn_t dcgmerr = dcgmInit();
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "error initializing DCGM, GPU metrics will not be "
<< "available: " << errorString(dcgmerr);
return false;
}
if (dcgm_metadata_.standalone_) {
char hostIpAddress[16] = {0};
std::string ipAddress = "127.0.0.1";
strncpy(hostIpAddress, ipAddress.c_str(), 15);
dcgmerr = dcgmConnect(hostIpAddress, &dcgm_metadata_.dcgm_handle_);
} else {
dcgmerr = dcgmStartEmbedded(
DCGM_OPERATION_MODE_MANUAL, &dcgm_metadata_.dcgm_handle_);
}
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "DCGM unable to start: " << errorString(dcgmerr);
return false;
} else {
// Set this flag to signal DCGM cleanup in destructor
dcgm_metadata_.dcgm_initialized_ = true;
}
if (dcgm_metadata_.standalone_) {
dcgmerr = dcgmUpdateAllFields(dcgm_metadata_.dcgm_handle_, 1);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "DCGM unable to update all fields, GPU metrics will "
"not be available: "
<< errorString(dcgmerr);
return false;
}
}
unsigned int dcgm_gpu_ids[DCGM_MAX_NUM_DEVICES];
int dcgm_gpu_count;
dcgmerr = dcgmGetAllDevices(
dcgm_metadata_.dcgm_handle_, dcgm_gpu_ids, &dcgm_gpu_count);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "DCGM unable to get device info and count, GPU "
"metrics will not be available: "
<< errorString(dcgmerr);
return false;
}
// Get PCI Bus ID to DCGM device Id map.
// Some devices may have problems using the DCGM API and
// these devices need to be ignored.
std::map<std::string, size_t> pci_bus_id_to_dcgm_id;
std::map<std::string, std::map<std::string, std::string> >
pci_bus_id_to_gpu_labels;
std::map<std::string, std::string> pci_bus_id_to_device_name;
dcgmDeviceAttributes_t gpu_attributes[DCGM_MAX_NUM_DEVICES];
for (int i = 0; i < dcgm_gpu_count; i++) {
gpu_attributes[i].version = dcgmDeviceAttributes_version;
dcgmerr = dcgmGetDeviceAttributes(
dcgm_metadata_.dcgm_handle_, dcgm_gpu_ids[i], &gpu_attributes[i]);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "DCGM unable to get device properties for DCGM device "
<< dcgm_gpu_ids[i]
<< ", GPU metrics will not be available for this device: "
<< errorString(dcgmerr);
} else {
std::string pciBusId = gpu_attributes[i].identifiers.pciBusId;
pci_bus_id_to_dcgm_id[pciBusId] = i;
pci_bus_id_to_device_name[pciBusId] =
std::string(gpu_attributes[i].identifiers.deviceName);
std::map<std::string, std::string> gpu_labels;
gpu_labels.insert(std::map<std::string, std::string>::value_type(
kMetricsLabelGpuUuid,
std::string(gpu_attributes[i].identifiers.uuid)));
pci_bus_id_to_gpu_labels[pciBusId] = gpu_labels;
}
}
// Get CUDA-visible PCI Bus Ids and get DCGM metrics for each CUDA-visible GPU
int cuda_gpu_count;
cudaError_t cudaerr = cudaGetDeviceCount(&cuda_gpu_count);
if (cudaerr != cudaSuccess) {
LOG_WARNING
<< "Cannot get CUDA device count, GPU metrics will not be available";
return false;
}
for (int i = 0; i < cuda_gpu_count; ++i) {
std::string pci_bus_id = "0000"; // pad 0's for uniformity
char pcibusid_str[64];
cudaerr = cudaDeviceGetPCIBusId(pcibusid_str, sizeof(pcibusid_str) - 1, i);
if (cudaerr == cudaSuccess) {
pci_bus_id.append(pcibusid_str);
if (pci_bus_id_to_dcgm_id.count(pci_bus_id) <= 0) {
LOG_INFO << "Skipping GPU:" << i
<< " since it's not CUDA enabled. This should never happen!";
continue;
}
// Filter out CUDA visible GPUs from GPUs found by DCGM
LOG_INFO << "Collecting metrics for GPU " << i << ": "
<< pci_bus_id_to_device_name[pci_bus_id];
auto& gpu_labels = pci_bus_id_to_gpu_labels[pci_bus_id];
gpu_utilization_.push_back(&gpu_utilization_family_.Add(gpu_labels));
gpu_memory_total_.push_back(&gpu_memory_total_family_.Add(gpu_labels));
gpu_memory_used_.push_back(&gpu_memory_used_family_.Add(gpu_labels));
gpu_power_usage_.push_back(&gpu_power_usage_family_.Add(gpu_labels));
gpu_power_limit_.push_back(&gpu_power_limit_family_.Add(gpu_labels));
gpu_energy_consumption_.push_back(
&gpu_energy_consumption_family_.Add(gpu_labels));
uint32_t dcgm_id = pci_bus_id_to_dcgm_id[pci_bus_id];
dcgm_metadata_.cuda_ids_to_dcgm_ids_[i] = dcgm_id;
dcgm_metadata_.available_cuda_gpu_ids_.emplace_back(i);
} else {
LOG_WARNING << "GPU metrics will not be available for device:" << i;
}
}
// create a gpu group
char groupName[] = "dcgm_group";
dcgmerr = dcgmGroupCreate(
dcgm_metadata_.dcgm_handle_, DCGM_GROUP_DEFAULT, groupName,
&dcgm_metadata_.groupId_);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "Cannot make GPU group: " << errorString(dcgmerr);
}
// Initialize tracking vectors
for (unsigned int didx = 0;
didx < dcgm_metadata_.available_cuda_gpu_ids_.size(); ++didx) {
dcgm_metadata_.power_limit_fail_cnt_.push_back(0);
dcgm_metadata_.power_usage_fail_cnt_.push_back(0);
dcgm_metadata_.energy_fail_cnt_.push_back(0);
dcgm_metadata_.util_fail_cnt_.push_back(0);
dcgm_metadata_.mem_fail_cnt_.push_back(0);
dcgm_metadata_.last_energy_.push_back(0);
}
// Number of fields for DCGM to use from fields_ below
dcgm_metadata_.field_count_ = 6;
unsigned short util_flag = dcgm_metadata_.standalone_
? DCGM_FI_PROF_GR_ENGINE_ACTIVE
: DCGM_FI_DEV_GPU_UTIL;
dcgm_metadata_.fields_ = {
DCGM_FI_DEV_POWER_MGMT_LIMIT, // power limit, watts
DCGM_FI_DEV_POWER_USAGE, // power usage, watts
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, // Total energy consumption, mJ
util_flag, // util ratio, 1 = 1%
DCGM_FI_DEV_FB_USED, // Frame buffer used, MiB
DCGM_FI_DEV_FB_TOTAL,                 // Frame buffer total, MiB
};
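// Note: PollDcgmMetrics() indexes field_values[] in this same order:
// [0] power limit, [1] power usage, [2] total energy, [3] utilization,
// [4] frame buffer used, [5] frame buffer total.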
char fieldName[] = "field_group";
dcgmFieldGrp_t fieldGroupId;
dcgmerr = dcgmFieldGroupCreate(
dcgm_metadata_.dcgm_handle_, dcgm_metadata_.field_count_,
dcgm_metadata_.fields_.data(), fieldName, &fieldGroupId);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "Cannot make field group: " << errorString(dcgmerr);
}
dcgmerr = dcgmWatchFields(
dcgm_metadata_.dcgm_handle_, dcgm_metadata_.groupId_, fieldGroupId,
metrics_interval_ms_ * 1000 /*update period, usec*/,
5.0 /*maxKeepAge, sec*/, 5 /*maxKeepSamples*/);
if (dcgmerr != DCGM_ST_OK) {
LOG_WARNING << "Cannot start watching fields: " << errorString(dcgmerr);
return false;
}
return true;
#endif // TRITON_ENABLE_METRICS_GPU
}
#ifdef TRITON_ENABLE_METRICS_GPU
std::string
Metrics::dcgmValueToErrorMessage(double val)
{
if (DCGM_FP64_IS_BLANK(val)) {
if (val == DCGM_FP64_BLANK) {
return "Not Specified";
} else if (val == DCGM_FP64_NOT_FOUND) {
return "Not Found";
} else if (val == DCGM_FP64_NOT_SUPPORTED) {
return "Not Supported";
} else if (val == DCGM_FP64_NOT_PERMISSIONED) {
return "Insf. Permission";
} else {
return "Unknown";
}
} else {
return std::to_string(val);
}
}
std::string
Metrics::dcgmValueToErrorMessage(int64_t val)
{
if (DCGM_INT64_IS_BLANK(val)) {
switch (val) {
case DCGM_INT64_BLANK:
return "Not Specified";
case DCGM_INT64_NOT_FOUND:
return "Not Found";
case DCGM_INT64_NOT_SUPPORTED:
return "Not Supported";
case DCGM_INT64_NOT_PERMISSIONED:
return "Insf. Permission";
default:
return "Unknown";
}
} else {
return std::to_string(val);
}
}
#endif // TRITON_ENABLE_METRICS_GPU
bool
Metrics::UUIDForCudaDevice(int cuda_device, std::string* uuid)
{
// If metrics were not initialized then just silently fail since
// without DCGM we can't get the CUDA device UUID (and it's not worth
// doing anyway since metrics aren't being reported).
auto singleton = GetSingleton();
if (!singleton->gpu_metrics_enabled_) {
return false;
}
// If GPU metrics is not enabled just silently fail.
#ifndef TRITON_ENABLE_METRICS_GPU
return false;
#else
dcgmDeviceAttributes_t gpu_attributes;
gpu_attributes.version = dcgmDeviceAttributes_version;
dcgmReturn_t dcgmerr = dcgmGetDeviceAttributes(
singleton->dcgm_metadata_.dcgm_handle_, cuda_device, &gpu_attributes);
if (dcgmerr != DCGM_ST_OK) {
LOG_ERROR << "Unable to get device UUID: " << errorString(dcgmerr);
return false;
}
*uuid = gpu_attributes.identifiers.uuid;
return true;
#endif // TRITON_ENABLE_METRICS_GPU
}
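// Illustrative usage (assumption):
//
//   std::string uuid;
//   if (Metrics::UUIDForCudaDevice(0 /* cuda device */, &uuid)) {
//     // label per-GPU metrics with 'uuid'
//   }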
std::shared_ptr<prometheus::Registry>
Metrics::GetRegistry()
{
auto singleton = Metrics::GetSingleton();
return singleton->registry_;
}
const std::string
Metrics::SerializedMetrics()
{
auto singleton = Metrics::GetSingleton();
return singleton->serializer_->Serialize(
singleton->registry_.get()->Collect());
}
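// Illustrative sketch (assumption, not the actual Triton HTTP endpoint
// code): a metrics handler can expose the Prometheus text format directly:
//
//   const std::string body = Metrics::SerializedMetrics();
//   // reply with HTTP 200, Content-Type "text/plain; version=0.0.4", body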
Metrics*
Metrics::GetSingleton()
{
static Metrics singleton;
return &singleton;
}
}} // namespace triton::core
#endif // TRITON_ENABLE_METRICS
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#ifdef TRITON_ENABLE_METRICS
#include <atomic>
#include <mutex>
#include <thread>
#include "prometheus/counter.h"
#include "prometheus/gauge.h"
#include "prometheus/registry.h"
#include "prometheus/serializer.h"
#include "prometheus/text_serializer.h"
#include "response_cache.h"
#ifdef TRITON_ENABLE_METRICS_GPU
#include <dcgm_agent.h>
#endif // TRITON_ENABLE_METRICS_GPU
namespace triton { namespace core {
#ifdef TRITON_ENABLE_METRICS_CPU
using MemInfo = std::unordered_map<std::string, uint64_t>;
// References:
// - htop source: https://stackoverflow.com/a/23376195
// - Linux docs: https://www.kernel.org/doc/Documentation/filesystems/proc.txt
// guest/guestnice values are counted in user/nice so we skip parsing them
struct CpuInfo {
uint64_t user = 0; // normal processes executing in user mode
uint64_t nice = 0; // niced processes executing in user mode
uint64_t system = 0; // processes executing in kernel mode
uint64_t idle = 0; // twiddling thumbs
uint64_t iowait = 0; // waiting for I/O to complete
uint64_t irq = 0; // servicing interrupts
uint64_t softirq = 0; // servicing softirqs
uint64_t steal = 0; // involuntary wait
};
inline std::istream&
operator>>(std::istream& is, CpuInfo& info)
{
is >> info.user >> info.nice >> info.system >> info.idle >> info.iowait >>
info.irq >> info.softirq >> info.steal;
return is;
}
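// Illustrative sketch (assumption): parsing the aggregate line of
// /proc/stat, e.g. "cpu  4705 150 1120 16250 520 0 17 0 0 0":
//
//   std::istringstream iss(line);
//   std::string tag;  // consumes the leading "cpu" token
//   CpuInfo info;
//   iss >> tag >> info;  // trailing guest/guest_nice columns are ignored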
#endif // TRITON_ENABLE_METRICS_CPU
#ifdef TRITON_ENABLE_METRICS_GPU
struct DcgmMetadata {
// DCGM handles for initialization and destruction
dcgmHandle_t dcgm_handle_ = 0;
dcgmGpuGrp_t groupId_ = 0;
// DCGM Flags
bool standalone_ = false;
// DCGM Fields
size_t field_count_ = 0;
std::vector<unsigned short> fields_;
// GPU Device Mapping
std::map<uint32_t, uint32_t> cuda_ids_to_dcgm_ids_;
std::vector<uint32_t> available_cuda_gpu_ids_;
// Stop attempting metrics if they fail multiple consecutive
// times for a device.
const int fail_threshold_ = 3;
// DCGM Failure Tracking
std::vector<int> power_limit_fail_cnt_;
std::vector<int> power_usage_fail_cnt_;
std::vector<int> energy_fail_cnt_;
std::vector<int> util_fail_cnt_;
std::vector<int> mem_fail_cnt_;
// DCGM Energy Tracking
std::vector<unsigned long long> last_energy_;
// Track if DCGM handle initialized successfully
bool dcgm_initialized_ = false;
};
#endif // TRITON_ENABLE_METRICS_GPU
class Metrics {
public:
// Return the hash value of the labels
static size_t HashLabels(const std::map<std::string, std::string>& labels);
// Are metrics enabled?
static bool Enabled();
// Enable reporting of metrics
static void EnableMetrics();
// Enable reporting of GPU metrics
static void EnableGPUMetrics();
// Enable reporting of CPU metrics
static void EnableCpuMetrics();
// Enable reporting of Cache metrics
static void EnableCacheMetrics(
std::shared_ptr<RequestResponseCache> response_cache);
// Start a thread for polling enabled metrics if any
static void StartPollingThreadSingleton(
std::shared_ptr<RequestResponseCache> response_cache);
// Set the time interval, in milliseconds, at which metrics are collected
static void SetMetricsInterval(uint64_t metrics_interval_ms);
// Get the prometheus registry
static std::shared_ptr<prometheus::Registry> GetRegistry();
// Get serialized metrics
static const std::string SerializedMetrics();
// Get the UUID for a CUDA device. Return true and initialize 'uuid'
// if a UUID is found, return false if a UUID cannot be returned.
static bool UUIDForCudaDevice(int cuda_device, std::string* uuid);
// Metric family counting successful inference requests
static prometheus::Family<prometheus::Counter>& FamilyInferenceSuccess()
{
return GetSingleton()->inf_success_family_;
}
// Metric family counting failed inference requests
static prometheus::Family<prometheus::Counter>& FamilyInferenceFailure()
{
return GetSingleton()->inf_failure_family_;
}
// Metric family counting inferences performed, where a batch-size
// 'n' inference request is counted as 'n' inferences
static prometheus::Family<prometheus::Counter>& FamilyInferenceCount()
{
return GetSingleton()->inf_count_family_;
}
// Metric family counting model executions performed, where a batched
// execution of 'n' requests is counted as a single execution
static prometheus::Family<prometheus::Counter>&
FamilyInferenceExecutionCount()
{
return GetSingleton()->inf_count_exec_family_;
}
// Metric family of cumulative inference request duration, in
// microseconds
static prometheus::Family<prometheus::Counter>&
FamilyInferenceRequestDuration()
{
return GetSingleton()->inf_request_duration_us_family_;
}
// Metric family of cumulative inference queuing duration, in
// microseconds
static prometheus::Family<prometheus::Counter>& FamilyInferenceQueueDuration()
{
return GetSingleton()->inf_queue_duration_us_family_;
}
// Metric family of cumulative inference compute durations, in
// microseconds
static prometheus::Family<prometheus::Counter>&
FamilyInferenceComputeInputDuration()
{
return GetSingleton()->inf_compute_input_duration_us_family_;
}
static prometheus::Family<prometheus::Counter>&
FamilyInferenceComputeInferDuration()
{
return GetSingleton()->inf_compute_infer_duration_us_family_;
}
static prometheus::Family<prometheus::Counter>&
FamilyInferenceComputeOutputDuration()
{
return GetSingleton()->inf_compute_output_duration_us_family_;
}
// Metric families of per-model response cache metrics
static prometheus::Family<prometheus::Counter>& FamilyCacheHitCount()
{
return GetSingleton()->cache_num_hits_model_family_;
}
static prometheus::Family<prometheus::Counter>& FamilyCacheHitLookupDuration()
{
return GetSingleton()->cache_hit_lookup_duration_us_model_family_;
}
static prometheus::Family<prometheus::Counter>& FamilyCacheMissCount()
{
return GetSingleton()->cache_num_misses_model_family_;
}
static prometheus::Family<prometheus::Counter>&
FamilyCacheMissLookupDuration()
{
return GetSingleton()->cache_miss_lookup_duration_us_model_family_;
}
static prometheus::Family<prometheus::Counter>&
FamilyCacheMissInsertionDuration()
{
return GetSingleton()->cache_miss_insertion_duration_us_model_family_;
}
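  // Illustrative sketch (assumption): per-model code typically adds one
  // labeled counter per family and increments it, e.g. (label values
  // illustrative):
  //
  //   prometheus::Counter& hits = Metrics::FamilyCacheHitCount().Add(
  //       {{"model", "resnet"}, {"version", "1"}});
  //   hits.Increment();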
private:
Metrics();
virtual ~Metrics();
static Metrics* GetSingleton();
bool InitializeDcgmMetrics();
bool InitializeCpuMetrics();
bool InitializeCacheMetrics(
std::shared_ptr<RequestResponseCache> response_cache);
bool StartPollingThread(std::shared_ptr<RequestResponseCache> response_cache);
bool PollCacheMetrics(std::shared_ptr<RequestResponseCache> response_cache);
bool PollDcgmMetrics();
bool PollCpuMetrics();
std::string dcgmValueToErrorMessage(double val);
std::string dcgmValueToErrorMessage(int64_t val);
std::shared_ptr<prometheus::Registry> registry_;
std::unique_ptr<prometheus::Serializer> serializer_;
prometheus::Family<prometheus::Counter>& inf_success_family_;
prometheus::Family<prometheus::Counter>& inf_failure_family_;
prometheus::Family<prometheus::Counter>& inf_count_family_;
prometheus::Family<prometheus::Counter>& inf_count_exec_family_;
prometheus::Family<prometheus::Counter>& inf_request_duration_us_family_;
prometheus::Family<prometheus::Counter>& inf_queue_duration_us_family_;
prometheus::Family<prometheus::Counter>&
inf_compute_input_duration_us_family_;
prometheus::Family<prometheus::Counter>&
inf_compute_infer_duration_us_family_;
prometheus::Family<prometheus::Counter>&
inf_compute_output_duration_us_family_;
// Global Response Cache metrics
prometheus::Family<prometheus::Gauge>& cache_num_entries_family_;
prometheus::Family<prometheus::Gauge>& cache_num_lookups_family_;
prometheus::Family<prometheus::Gauge>& cache_num_hits_family_;
prometheus::Family<prometheus::Gauge>& cache_num_misses_family_;
prometheus::Family<prometheus::Gauge>& cache_num_evictions_family_;
prometheus::Family<prometheus::Gauge>& cache_lookup_duration_us_family_;
prometheus::Family<prometheus::Gauge>& cache_insertion_duration_us_family_;
prometheus::Family<prometheus::Gauge>& cache_util_family_;
// Gauges for Global Response Cache metrics
prometheus::Gauge* cache_num_entries_global_;
prometheus::Gauge* cache_num_lookups_global_;
prometheus::Gauge* cache_num_hits_global_;
prometheus::Gauge* cache_num_misses_global_;
prometheus::Gauge* cache_num_evictions_global_;
prometheus::Gauge* cache_lookup_duration_us_global_;
prometheus::Gauge* cache_insertion_duration_us_global_;
prometheus::Gauge* cache_util_global_;
// Per-model Response Cache metrics
prometheus::Family<prometheus::Counter>& cache_num_hits_model_family_;
prometheus::Family<prometheus::Counter>&
cache_hit_lookup_duration_us_model_family_;
prometheus::Family<prometheus::Counter>& cache_num_misses_model_family_;
prometheus::Family<prometheus::Counter>&
cache_miss_lookup_duration_us_model_family_;
prometheus::Family<prometheus::Counter>&
cache_miss_insertion_duration_us_model_family_;
#ifdef TRITON_ENABLE_METRICS_GPU
prometheus::Family<prometheus::Gauge>& gpu_utilization_family_;
prometheus::Family<prometheus::Gauge>& gpu_memory_total_family_;
prometheus::Family<prometheus::Gauge>& gpu_memory_used_family_;
prometheus::Family<prometheus::Gauge>& gpu_power_usage_family_;
prometheus::Family<prometheus::Gauge>& gpu_power_limit_family_;
prometheus::Family<prometheus::Counter>& gpu_energy_consumption_family_;
std::vector<prometheus::Gauge*> gpu_utilization_;
std::vector<prometheus::Gauge*> gpu_memory_total_;
std::vector<prometheus::Gauge*> gpu_memory_used_;
std::vector<prometheus::Gauge*> gpu_power_usage_;
std::vector<prometheus::Gauge*> gpu_power_limit_;
std::vector<prometheus::Counter*> gpu_energy_consumption_;
DcgmMetadata dcgm_metadata_;
#endif // TRITON_ENABLE_METRICS_GPU
#ifdef TRITON_ENABLE_METRICS_CPU
// Parses "/proc/meminfo" for metrics, currently only supported on Linux.
Status ParseMemInfo(MemInfo& info);
// Parses "/proc/stat" for metrics, currently only supported on Linux.
Status ParseCpuInfo(CpuInfo& info);
// Computes CPU utilization between "info_new" and "info_old" values
double CpuUtilization(const CpuInfo& info_new, const CpuInfo& info_old);
prometheus::Family<prometheus::Gauge>& cpu_utilization_family_;
prometheus::Family<prometheus::Gauge>& cpu_memory_total_family_;
prometheus::Family<prometheus::Gauge>& cpu_memory_used_family_;
prometheus::Gauge* cpu_utilization_;
prometheus::Gauge* cpu_memory_total_;
prometheus::Gauge* cpu_memory_used_;
CpuInfo last_cpu_info_;
#endif // TRITON_ENABLE_METRICS_CPU
// Thread for polling cache/GPU/CPU metrics periodically
std::unique_ptr<std::thread> poll_thread_;
std::atomic<bool> poll_thread_exit_;
bool metrics_enabled_;
bool gpu_metrics_enabled_;
bool cpu_metrics_enabled_;
bool cache_metrics_enabled_;
bool poll_thread_started_;
std::mutex metrics_enabling_;
std::mutex poll_thread_starting_;
uint64_t metrics_interval_ms_;
};
}} // namespace triton::core
#endif // TRITON_ENABLE_METRICS
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "model.h"
#include <chrono>
#include <future>
#include "constants.h"
#include "filesystem.h"
#include "infer_request.h"
#include "model_config_utils.h"
#include "triton/common/logging.h"
namespace triton { namespace core {
Status
Model::GetInput(
const std::string& name, const inference::ModelInput** input) const
{
const auto itr = input_map_.find(name);
if (itr == input_map_.end()) {
return Status(
Status::Code::INVALID_ARG,
"unexpected inference input '" + name + "' for model '" + Name() + "'");
}
*input = &itr->second;
return Status::Success;
}
Status
Model::GetOutput(
const std::string& name, const inference::ModelOutput** output) const
{
const auto itr = output_map_.find(name);
if (itr == output_map_.end()) {
return Status(
Status::Code::INVALID_ARG, "unexpected inference output '" + name +
"' for model '" + Name() + "'");
}
*output = &itr->second;
return Status::Success;
}
Status
Model::SetModelConfig(const inference::ModelConfig& config)
{
config_ = config;
set_model_config_ = true;
return Status::Success;
}
Status
Model::SetScheduler(std::unique_ptr<Scheduler> scheduler)
{
if (scheduler_ != nullptr) {
return Status(
Status::Code::INTERNAL, "Attempt to change scheduler not allowed");
}
scheduler_ = std::move(scheduler);
return Status::Success;
}
Status
Model::Init(const bool is_config_provided)
{
if (!set_model_config_ && !is_config_provided) {
return Status(
Status::Code::NOT_FOUND,
"model configuration is not provided for model '" + Name() + "'");
}
RETURN_IF_ERROR(ValidateModelConfig(config_, min_compute_capability_));
RETURN_IF_ERROR(ValidateModelIOConfig(config_));
// Initialize the input map
for (const auto& io : config_.input()) {
input_map_.insert(std::make_pair(io.name(), io));
if (!io.optional()) {
++required_input_count_;
}
}
// Initialize the output map and label provider for each output
label_provider_ = std::make_shared<LabelProvider>();
for (const auto& io : config_.output()) {
output_map_.insert(std::make_pair(io.name(), io));
if (!io.label_filename().empty()) {
const auto label_path = JoinPath({model_dir_, io.label_filename()});
RETURN_IF_ERROR(label_provider_->AddLabels(io.name(), label_path));
}
}
if (config_.has_dynamic_batching()) {
default_priority_level_ =
config_.dynamic_batching().default_priority_level();
max_priority_level_ = config_.dynamic_batching().priority_levels();
} else if (config_.has_ensemble_scheduling()) {
// For ensemble, allow any priority level to pass through
default_priority_level_ = 0;
max_priority_level_ = UINT32_MAX;
} else {
default_priority_level_ = 0;
max_priority_level_ = 0;
}
return Status::Success;
}
}} // namespace triton::core
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "infer_stats.h"
#include "label_provider.h"
#include "model_config.pb.h"
#include "scheduler.h"
#include "status.h"
namespace triton { namespace core {
class InferenceRequest;
//
// Interface for models that handle inference requests.
//
class Model {
public:
explicit Model(
const double min_compute_capability, const std::string& model_dir,
const int64_t version, const inference::ModelConfig& config)
: config_(config), min_compute_capability_(min_compute_capability),
version_(version), required_input_count_(0), model_dir_(model_dir),
set_model_config_(false)
{
}
virtual ~Model() {}
// Get the name of the model being served.
const std::string& Name() const { return config_.name(); }
// Get the version of the model being served.
int64_t Version() const { return version_; }
// Get the configuration of the model being served.
const inference::ModelConfig& Config() const { return config_; }
// Get the number of required inputs
size_t RequiredInputCount() const { return required_input_count_; }
// Get the stats collector for the model being served.
InferenceStatsAggregator* MutableStatsAggregator()
{
return &stats_aggregator_;
}
const InferenceStatsAggregator& StatsAggregator() const
{
return stats_aggregator_;
}
// Get the model configuration for a named input.
Status GetInput(
const std::string& name, const inference::ModelInput** input) const;
// Get the model configuration for a named output.
Status GetOutput(
const std::string& name, const inference::ModelOutput** output) const;
// Get a label provider for the model.
const std::shared_ptr<LabelProvider>& GetLabelProvider() const
{
return label_provider_;
}
// Initialize the instance for Triton core usage
Status Init(const bool is_config_provided);
// Enqueue a request for execution. If Status::Success is returned
// then the model has taken ownership of the request object and so
// 'request' will be nullptr. If non-success is returned then the
// caller still retains ownership of 'request'.
Status Enqueue(std::unique_ptr<InferenceRequest>& request)
{
return scheduler_->Enqueue(request);
}
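  // Illustrative sketch (assumption): callers should check whether
  // ownership transferred after Enqueue:
  //
  //   std::unique_ptr<InferenceRequest> request = ...;
  //   Status status = model->Enqueue(request);
  //   if (status.IsOk()) {
  //     // 'request' is now nullptr; the model owns it.
  //   } else {
  //     // caller still owns 'request' and may retry or release it.
  //   }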
// Return the number of in-flight inferences.
size_t InflightInferenceCount()
{
return scheduler_->InflightInferenceCount();
}
// Stop processing future requests unless they are considered in-flight.
void Stop() { scheduler_->Stop(); }
uint32_t DefaultPriorityLevel() const { return default_priority_level_; }
uint32_t MaxPriorityLevel() const { return max_priority_level_; }
protected:
// Set the configuration of the model being served.
Status SetModelConfig(const inference::ModelConfig& config);
// Explicitly set the scheduler to use for inference requests to the
// model. The scheduler can only be set once for a model.
Status SetScheduler(std::unique_ptr<Scheduler> scheduler);
// The scheduler to use for this model.
std::unique_ptr<Scheduler> scheduler_;
// Configuration of the model.
inference::ModelConfig config_;
private:
// The minimum supported CUDA compute capability.
const double min_compute_capability_;
// Version of the model.
int64_t version_;
// The stats collector for the model.
InferenceStatsAggregator stats_aggregator_;
// Label provider for this model.
std::shared_ptr<LabelProvider> label_provider_;
size_t required_input_count_;
// Map from input name to the model configuration for that input.
std::unordered_map<std::string, inference::ModelInput> input_map_;
// Map from output name to the model configuration for that output.
std::unordered_map<std::string, inference::ModelOutput> output_map_;
// Path to model
std::string model_dir_;
// The default priority level for the model.
uint32_t default_priority_level_;
// The largest priority value for the model.
uint32_t max_priority_level_;
// Whether or not model config has been set.
bool set_model_config_;
};
}} // namespace triton::core
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "model_config_cuda.h"
#include <cuda_runtime_api.h>
namespace triton { namespace core {
int
GetCudaStreamPriority(
inference::ModelOptimizationPolicy::ModelPriority priority)
{
// Default priority is 0
int cuda_stream_priority = 0;
int min, max;
cudaError_t cuerr = cudaDeviceGetStreamPriorityRange(&min, &max);
if ((cuerr != cudaErrorNoDevice) && (cuerr != cudaSuccess)) {
return 0;
}
switch (priority) {
case inference::ModelOptimizationPolicy::PRIORITY_MAX:
cuda_stream_priority = max;
break;
case inference::ModelOptimizationPolicy::PRIORITY_MIN:
cuda_stream_priority = min;
break;
default:
cuda_stream_priority = 0;
break;
}
return cuda_stream_priority;
}
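// Illustrative sketch (assumption): a backend creating a stream for a
// high-priority model might use the returned value like this:
//
//   int prio = GetCudaStreamPriority(
//       inference::ModelOptimizationPolicy::PRIORITY_MAX);
//   cudaStream_t stream;
//   cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, prio);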
}} // namespace triton::core
// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <stdint.h>
#include "model_config.pb.h"
namespace triton { namespace core {
/// Get the CUDA stream priority for a given ModelPriority.
/// \param priority The inference::ModelOptimizationPolicy::ModelPriority
/// priority.
/// \return The CUDA stream priority.
int GetCudaStreamPriority(
inference::ModelOptimizationPolicy::ModelPriority priority);
}} // namespace triton::core
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "model_config_utils.h"
#include <google/protobuf/util/json_util.h>
#include <deque>
#include <mutex>
#include <set>
#include "constants.h"
#include "cuda_utils.h"
#include "filesystem.h"
#include "triton/common/logging.h"
#define TRITONJSON_STATUSTYPE triton::core::Status
#define TRITONJSON_STATUSRETURN(M) \
return triton::core::Status(triton::core::Status::Code::INTERNAL, (M))
#define TRITONJSON_STATUSSUCCESS triton::core::Status::Success
#include "triton/common/triton_json.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace core {
namespace {
#ifdef TRITON_ENABLE_ENSEMBLE
struct EnsembleTensor {
EnsembleTensor(bool isOutput) : ready(false), isOutput(isOutput) {}
bool ready;
bool isOutput;
std::vector<EnsembleTensor*> prev_nodes;
std::vector<EnsembleTensor*> next_nodes;
};
/// Build a graph that represents the data flow in the ensemble specified in
/// the given model config. The nodes (ensemble tensors) in the graph can be
/// looked up using their names as keys.
/// \param config The model configuration that specifies the
/// ensemble_scheduling field.
/// \param keyed_ensemble_graph Returns the ensemble graph.
/// \return The error status. A non-OK status indicates that the build failed
/// because the ensemble configuration is not valid.
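///
/// Illustrative example (a hypothetical ensemble_scheduling fragment, not
/// taken from any particular model): the single step below produces two
/// graph nodes, with ensemble tensor "raw" feeding ensemble tensor "result".
///
///   ensemble_scheduling {
///     step [
///       {
///         model_name: "preprocess"
///         model_version: -1
///         input_map { key: "INPUT0" value: "raw" }
///         output_map { key: "OUTPUT0" value: "result" }
///       }
///     ]
///   }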
Status
BuildEnsembleGraph(
const inference::ModelConfig& config,
std::unordered_map<std::string, EnsembleTensor>& keyed_ensemble_graph)
{
keyed_ensemble_graph.clear();
size_t step_idx = 0;
for (const auto& element : config.ensemble_scheduling().step()) {
if (element.model_name().empty()) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'model_name' in step " + std::to_string(step_idx) +
" of ensemble '" + config.name() + "'");
}
if (element.input_map().size() == 0) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'input_map' in step " + std::to_string(step_idx) +
" of ensemble '" + config.name() + "'");
}
if (element.output_map().size() == 0) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'output_map' in step " + std::to_string(step_idx) +
" of ensemble '" + config.name() + "'");
}
// Link ensemble tensors
std::vector<EnsembleTensor*> tensor_as_output;
for (const auto& output_map : element.output_map()) {
auto it = keyed_ensemble_graph.find(output_map.second);
if (it != keyed_ensemble_graph.end()) {
if (it->second.isOutput) {
return Status(
Status::Code::INVALID_ARG,
"ensemble tensor '" + it->first +
"' can appear in an output map only once for ensemble '" +
config.name() + "' step " + std::to_string(step_idx));
} else {
it->second.isOutput = true;
}
} else {
it = keyed_ensemble_graph
.emplace(
std::make_pair(output_map.second, EnsembleTensor(true)))
.first;
}
tensor_as_output.push_back(&(it->second));
}
std::set<std::string> model_inputs;
for (const auto& input_map : element.input_map()) {
if (model_inputs.find(input_map.first) != model_inputs.end()) {
return Status(
Status::Code::INVALID_ARG,
"input '" + input_map.first + "' in model '" +
element.model_name() +
"' is mapped to multiple ensemble tensors for ensemble '" +
config.name() + "' step " + std::to_string(step_idx));
} else {
model_inputs.emplace(input_map.first);
}
auto it = keyed_ensemble_graph.find(input_map.second);
if (it == keyed_ensemble_graph.end()) {
it = keyed_ensemble_graph
.emplace(
std::make_pair(input_map.second, EnsembleTensor(false)))
.first;
}
for (auto output : tensor_as_output) {
output->prev_nodes.push_back(&(it->second));
it->second.next_nodes.push_back(output);
}
}
step_idx++;
}
return Status::Success;
}
Status
ValidateEnsembleSchedulingConfig(const inference::ModelConfig& config)
{
if (config.platform() != kEnsemblePlatform) {
return Status(
Status::Code::INVALID_ARG,
"ensemble scheduling cannot be set for model '" + config.name() +
"' whose platform is not " + kEnsemblePlatform);
}
if (config.instance_group().size() != 0) {
return Status(
Status::Code::INVALID_ARG,
"instance group should not be specified for ensemble '" +
config.name() + "'");
}
if (config.has_optimization()) {
return Status(
Status::Code::INVALID_ARG,
"optimization should not be specified for ensemble '" + config.name() +
"'");
}
if (config.model_warmup_size() != 0) {
return Status(
Status::Code::INVALID_ARG,
"model_warmup can not be specified for ensemble '" + config.name() +
"'");
}
// Make sure step is not empty and all fields are set
if (config.ensemble_scheduling().step_size() == 0) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'step' for ensemble '" + config.name() + "'");
}
std::unordered_map<std::string, EnsembleTensor> tensors;
RETURN_IF_ERROR(BuildEnsembleGraph(config, tensors));
// check data flow
std::deque<EnsembleTensor*> ready_queue;
for (const auto& input : config.input()) {
auto it = tensors.find(input.name());
if (it == tensors.end()) {
return Status(
Status::Code::INVALID_ARG, "ensemble input '" + input.name() +
"' for ensemble " + config.name() +
"' is not used");
}
it->second.ready = true;
ready_queue.push_back(&(it->second));
}
while (!ready_queue.empty()) {
auto& ready_node = ready_queue.front();
for (auto& next_node : ready_node->next_nodes) {
if (next_node->ready) {
continue;
}
bool next_node_ready = true;
for (auto& prev_node : next_node->prev_nodes) {
if (!prev_node->ready) {
next_node_ready = false;
break;
}
}
next_node->ready = next_node_ready;
if (next_node_ready) {
ready_queue.push_back(next_node);
}
}
ready_queue.pop_front();
}
std::set<std::string> outputs;
for (const auto& output : config.output()) {
auto it = tensors.find(output.name());
if (it == tensors.end()) {
return Status(
Status::Code::INVALID_ARG, "ensemble output '" + output.name() +
"' for ensemble " + config.name() +
"' is not used");
}
if (!it->second.ready) {
return Status(
Status::Code::INVALID_ARG, "output '" + output.name() +
"' for ensemble '" + config.name() +
"' is not written");
} else {
outputs.insert(it->first);
}
}
// Check redundant ensemble tensors
for (const auto& tensor : tensors) {
// skip ensemble outputs as they have been checked and can have no
// next nodes
if (outputs.find(tensor.first) != outputs.end()) {
continue;
}
if (!tensor.second.ready || (tensor.second.next_nodes.size() == 0)) {
return Status(
Status::Code::INVALID_ARG, "ensemble tensor '" + tensor.first +
"' is unused in ensemble '" +
config.name() + "'");
}
}
return Status::Success;
}
#endif // TRITON_ENABLE_ENSEMBLE
template <class ModelIO>
Status
ValidateIOShape(
const ModelIO& io, int32_t max_batch_size,
const std::string& message_prefix = "")
{
if (io.name().empty()) {
return Status(
Status::Code::INVALID_ARG, message_prefix + "must specify 'name'");
}
if (io.data_type() == inference::DataType::TYPE_INVALID) {
return Status(
Status::Code::INVALID_ARG, "model output must specify 'data_type'");
}
if (io.dims_size() == 0) {
return Status(
Status::Code::INVALID_ARG, message_prefix + "must specify 'dims'");
}
// If the configuration is non-batching, then no input or output
// reshape can be empty as that would mean that input or output was
// always empty (no data).
if (io.has_reshape() && (io.reshape().shape_size() == 0) &&
(max_batch_size == 0)) {
return Status(
Status::Code::INVALID_ARG,
message_prefix +
"cannot have empty reshape for non-batching model as scalar "
"tensors are not supported");
}
for (auto dim : io.dims()) {
// Dimension cannot be 0.
if ((dim < 1) && (dim != triton::common::WILDCARD_DIM)) {
return Status(
Status::Code::INVALID_ARG,
message_prefix + "dimension must be integer >= 1, or " +
std::to_string(triton::common::WILDCARD_DIM) +
" to indicate a variable-size dimension");
}
}
if (io.has_reshape()) {
// Zeros are not allowed in reshape.
for (auto dim : io.reshape().shape()) {
if ((dim < 1) && (dim != triton::common::WILDCARD_DIM)) {
return Status(
Status::Code::INVALID_ARG,
message_prefix + "reshape dimensions must be integer >= 1, or " +
std::to_string(triton::common::WILDCARD_DIM) +
" to indicate a variable-size dimension");
}
}
const int64_t dims_size = triton::common::GetElementCount(io.dims());
const int64_t reshape_size =
triton::common::GetElementCount(io.reshape().shape());
// dims and reshape must both have same element count
// or both have variable-size dimension.
// Special case for empty reshape... expect dims to have element
// count of 1.
if ((dims_size != reshape_size) &&
((reshape_size != 0) || (dims_size != 1))) {
return Status(
Status::Code::INVALID_ARG,
message_prefix + "has different size for dims and reshape");
}
// If the shape contains a variable-size dimension, compare whether each
// pair of chunks separated by the variable-size dimensions has the same
// element count. For instance, reshaping from [2, 4, -1, 6] to
// [8, -1, 1, 6] is valid as 2 * 4 = 8 and 6 = 1 * 6.
if (dims_size == -1) {
std::vector<int64_t> dim_element_cnts;
std::vector<int64_t> reshape_element_cnts;
int64_t current_cnt = 1;
for (const auto& dim : io.dims()) {
if (dim != -1) {
current_cnt *= dim;
} else {
dim_element_cnts.push_back(current_cnt);
current_cnt = 1;
}
}
dim_element_cnts.push_back(current_cnt);
current_cnt = 1;
for (const auto& dim : io.reshape().shape()) {
if (dim != -1) {
current_cnt *= dim;
} else {
reshape_element_cnts.push_back(current_cnt);
current_cnt = 1;
}
}
reshape_element_cnts.push_back(current_cnt);
if (dim_element_cnts.size() != reshape_element_cnts.size()) {
return Status(
Status::Code::INVALID_ARG,
message_prefix +
"has different number of variable-size dimensions for dims "
"and reshape");
}
for (size_t idx = 0; idx < dim_element_cnts.size(); idx++) {
if (dim_element_cnts[idx] != reshape_element_cnts[idx]) {
return Status(
Status::Code::INVALID_ARG,
message_prefix + "has different size for dims and reshape");
}
}
}
}
return Status::Success;
}
} // namespace
Status
GetModelVersionFromPath(const std::string& path, int64_t* version)
{
auto version_dir = BaseName(path);
// Determine the version from the last segment of 'path'
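// (For example, for a hypothetical path "/models/mymodel/3", the last
// segment is "3" and the parsed version is 3.)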
try {
*version = std::atoll(version_dir.c_str());
}
catch (...) {
return Status(
Status::Code::INTERNAL,
"unable to determine model version from " + path);
}
return Status::Success;
}
Status
GetBooleanSequenceControlProperties(
const inference::ModelSequenceBatching& batcher,
const std::string& model_name,
const inference::ModelSequenceBatching::Control::Kind control_kind,
const bool required, std::string* tensor_name,
inference::DataType* tensor_datatype, float* fp32_false_value,
float* fp32_true_value, int32_t* int32_false_value,
int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value)
{
// Make sure same tensor is not configured for multiple controls
std::set<std::string> seen_tensors;
// Make sure the control kind is not mentioned multiple times.
bool seen_control = false;
for (const auto& control_input : batcher.control_input()) {
if (control_input.name().empty()) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control tensor must have a name for " +
model_name);
}
if (seen_tensors.find(control_input.name()) != seen_tensors.end()) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control tensor '" + control_input.name() +
"' is specified for multiple control kinds for " + model_name);
}
seen_tensors.insert(control_input.name());
for (const auto& c : control_input.control()) {
if (c.kind() == control_kind) {
if (seen_control) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching specifies multiple " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" tensors for " + model_name);
}
*tensor_name = control_input.name();
seen_control = true;
// Make sure only one of int, float, or bool type is specified.
if (!((c.int32_false_true_size() != 0) ||
(c.fp32_false_true_size() != 0) ||
(c.bool_false_true_size() != 0))) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching must specify either 'int32_false_true', "
"'fp32_false_true' or 'bool_false_true' for " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" for " + model_name);
} else if (
((c.int32_false_true_size() != 0) &&
(c.fp32_false_true_size() != 0)) ||
((c.int32_false_true_size() != 0) &&
(c.bool_false_true_size() != 0)) ||
((c.fp32_false_true_size() != 0) &&
(c.bool_false_true_size() != 0))) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching specifies more than one from "
"'int32_false_true', 'fp32_false_true' and 'bool_false_true' "
"for " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" for " + model_name);
}
if (c.int32_false_true_size() > 0) {
if (c.int32_false_true_size() != 2) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control 'int32_false_true' must have "
"exactly 2 entries for " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" for " + model_name);
}
if (tensor_datatype != nullptr) {
*tensor_datatype = inference::DataType::TYPE_INT32;
}
if (int32_false_value != nullptr) {
*int32_false_value = c.int32_false_true(0);
}
if (int32_true_value != nullptr) {
*int32_true_value = c.int32_false_true(1);
}
} else if (c.fp32_false_true_size() > 0) {
if (c.fp32_false_true_size() != 2) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control 'fp32_false_true' must have exactly "
"2 entries for " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" for " + model_name);
}
if (tensor_datatype != nullptr) {
*tensor_datatype = inference::DataType::TYPE_FP32;
}
if (fp32_false_value != nullptr) {
*fp32_false_value = c.fp32_false_true(0);
}
if (fp32_true_value != nullptr) {
*fp32_true_value = c.fp32_false_true(1);
}
} else {
if (c.bool_false_true_size() != 2) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control 'bool_false_true' must have exactly "
"2 entries for " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" for " + model_name);
}
if (tensor_datatype != nullptr) {
*tensor_datatype = inference::DataType::TYPE_BOOL;
}
if (bool_false_value != nullptr) {
*bool_false_value = c.bool_false_true(0);
}
if (bool_true_value != nullptr) {
*bool_true_value = c.bool_false_true(1);
}
}
}
}
}
if (!seen_control) {
if (required) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control tensor must specify a " +
inference::ModelSequenceBatching_Control_Kind_Name(control_kind) +
" value for " + model_name);
}
tensor_name->clear();
}
return Status::Success;
}
Status
GetTypedSequenceControlProperties(
const inference::ModelSequenceBatching& batcher,
const std::string& model_name,
const inference::ModelSequenceBatching::Control::Kind control_kind,
const bool required, std::string* tensor_name,
inference::DataType* tensor_datatype)
{
// Make sure same tensor is not configured for multiple controls
std::set<std::string> seen_tensors;
// Make sure the control kind is not mentioned multiple times.
bool seen_control = false;
for (const auto& control_input : batcher.control_input()) {
if (control_input.name().empty()) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control tensor must have a name for " +
model_name);
}
if (seen_tensors.find(control_input.name()) != seen_tensors.end()) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control tensor '" + control_input.name() +
"' is specified for multiple control kinds for " + model_name);
}
seen_tensors.insert(control_input.name());
for (const auto& c : control_input.control()) {
if (c.kind() == control_kind) {
if (seen_control) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching specifies multiple " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" tensors for " + model_name);
}
*tensor_name = control_input.name();
if (tensor_datatype != nullptr) {
*tensor_datatype = c.data_type();
}
seen_control = true;
if ((c.int32_false_true_size() > 0) || (c.fp32_false_true_size() > 0) ||
(c.bool_false_true_size() > 0)) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching must not specify either 'int32_false_true', "
"'fp32_false_true' or 'bool_false_true' for " +
inference::ModelSequenceBatching_Control_Kind_Name(
control_kind) +
" for " + model_name);
}
}
}
}
if (!seen_control) {
if (required) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching control tensor must specify a " +
inference::ModelSequenceBatching_Control_Kind_Name(control_kind) +
" value for " + model_name);
}
tensor_name->clear();
}
return Status::Success;
}
Status
GetNormalizedModelConfig(
const std::string& model_name, const std::string& path,
const double min_compute_capability, inference::ModelConfig* config)
{
// Server-side autofill only sets certain backend fields for models that
// belong to a limited set of backends, for backwards compatibility. See the
// TensorRT, ONNX Runtime, OpenVINO, TensorFlow, and PyTorch backends.
// Extracting detailed information is delegated to the backend's own
// auto-complete implementation.
RETURN_IF_ERROR(
AutoCompleteBackendFields(model_name, std::string(path), config));
LOG_VERBOSE(1) << "Server side auto-completed config: "
<< config->DebugString();
RETURN_IF_ERROR(NormalizeModelConfig(min_compute_capability, config));
return Status::Success;
}
Status
NormalizeModelConfig(
const double min_compute_capability, inference::ModelConfig* config)
{
// If version_policy is not specified, default to Latest 1 version.
if (!config->has_version_policy()) {
inference::ModelVersionPolicy::Latest latest;
latest.set_num_versions(1);
config->mutable_version_policy()->mutable_latest()->CopyFrom(latest);
}
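// (In config.pbtxt terms, an unset version_policy is therefore normalized
// to "version_policy { latest { num_versions: 1 } }".)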
// If dynamic batching is specified...
if (config->has_dynamic_batching()) {
// If preferred batch size is not specified set it to
// max-batch-size.
if (config->dynamic_batching().preferred_batch_size().size() == 0) {
auto mutable_preferred_batch_size =
config->mutable_dynamic_batching()->mutable_preferred_batch_size();
if (config->max_batch_size() > 0) {
mutable_preferred_batch_size->Add(config->max_batch_size());
}
}
}
// If sequence batching is specified...
if (config->has_sequence_batching()) {
// Set default if max sequence idle is not specified.
if (config->sequence_batching().max_sequence_idle_microseconds() == 0) {
config->mutable_sequence_batching()->set_max_sequence_idle_microseconds(
SEQUENCE_IDLE_DEFAULT_MICROSECONDS);
}
if (config->sequence_batching().has_oldest()) {
// If preferred batch size is not specified set it to
// max-batch-size.
if (config->sequence_batching().oldest().preferred_batch_size().size() ==
0) {
auto mutable_preferred_batch_size =
config->mutable_sequence_batching()
->mutable_oldest()
->mutable_preferred_batch_size();
if (config->max_batch_size() > 0) {
mutable_preferred_batch_size->Add(config->max_batch_size());
}
}
}
}
// If ensemble scheduling is specified, don't set the pinned-memory
// optimization defaults below as 'optimization' is not allowed for ensembles.
if (!config->has_ensemble_scheduling()) {
auto optimization = config->mutable_optimization();
if (!optimization->has_input_pinned_memory()) {
optimization->mutable_input_pinned_memory()->set_enable(true);
}
if (!optimization->has_output_pinned_memory()) {
optimization->mutable_output_pinned_memory()->set_enable(true);
}
}
return Status::Success;
}
Status
NormalizeInstanceGroup(
const double min_compute_capability,
const std::vector<inference::ModelInstanceGroup>& preferred_groups,
inference::ModelConfig* config)
{
// Instance group setting doesn't apply to ensemble
if (config->has_ensemble_scheduling()) {
return Status::Success;
}
// Creates a set of supported GPU device ids
std::set<int> supported_gpus;
#ifdef TRITON_ENABLE_GPU
// Get the total number of GPUs from the runtime library.
Status status = GetSupportedGPUs(&supported_gpus, min_compute_capability);
if (!status.IsOk()) {
return status;
}
#endif // TRITON_ENABLE_GPU
// Make sure there is at least one instance_group.
if (config->instance_group().empty()) {
inference::ModelInstanceGroup* group = config->add_instance_group();
group->set_name(config->name());
for (const auto& pg : preferred_groups) {
group->set_kind(pg.kind());
group->set_count(pg.count());
// handle preferred GPU setting differently based on kind
if (pg.kind() == inference::ModelInstanceGroup::KIND_GPU) {
// Don't use preferred group with KIND_GPU if there is no GPU.
if (supported_gpus.empty()) {
continue;
}
// If preferred group sets GPUs, limit deployment onto those that
// are also listed in supported gpus
if (!pg.gpus().empty()) {
for (const int32_t gid : pg.gpus()) {
if (supported_gpus.find(gid) != supported_gpus.end()) {
group->add_gpus(gid);
}
}
}
break;
} else if (pg.kind() == inference::ModelInstanceGroup::KIND_AUTO) {
// If AUTO, set the preferred GPUs as-is, to align with the KIND_AUTO
// deduction specified below.
for (const int32_t gid : pg.gpus()) {
group->add_gpus(gid);
}
break;
}
// Other kind should not set GPUs
break;
}
}
// Assign default name, kind and count to each instance group that
// doesn't give those values explicitly. For KIND_GPU, set GPUs to
// all available if not specified explicitly.
size_t cnt = 0;
for (auto& group : *config->mutable_instance_group()) {
// Name
if (group.name().empty()) {
group.set_name(config->name() + "_" + std::to_string(cnt));
}
cnt++;
// For KIND_AUTO... if there are no GPUs or if any of the listed
// GPUs are not present, then use KIND_CPU.
if (group.kind() == inference::ModelInstanceGroup::KIND_AUTO) {
if (supported_gpus.empty()) {
group.set_kind(inference::ModelInstanceGroup::KIND_CPU);
} else {
for (const int32_t gid : group.gpus()) {
if (supported_gpus.find(gid) == supported_gpus.end()) {
group.set_kind(inference::ModelInstanceGroup::KIND_CPU);
break;
}
}
}
if (group.kind() == inference::ModelInstanceGroup::KIND_AUTO) {
group.set_kind(inference::ModelInstanceGroup::KIND_GPU);
}
}
// KIND is resolved at this point
for (const auto& pg : preferred_groups) {
if (group.kind() != pg.kind()) {
continue;
}
// Limit the GPU setting to what is specified in the preferred group;
// if no GPU is available, skip to the next preferred group.
if ((group.kind() == inference::ModelInstanceGroup::KIND_GPU) &&
group.gpus().empty() && !pg.gpus().empty()) {
for (const int32_t gid : pg.gpus()) {
if (supported_gpus.find(gid) != supported_gpus.end()) {
group.add_gpus(gid);
}
}
if (group.gpus().empty()) {
continue;
}
}
if ((group.count() < 1) && (pg.count() > 0)) {
group.set_count(pg.count());
}
}
// Set Triton default if the fields are not set from preferred group
// Count
if (group.count() < 1) {
RETURN_IF_ERROR(SetDefaultInstanceCount(&group, config->backend()));
}
// GPUs
if ((group.kind() == inference::ModelInstanceGroup::KIND_GPU) &&
(group.gpus().size() == 0)) {
for (auto d : supported_gpus) {
group.add_gpus(d);
}
}
}
return Status::Success;
}
Status
LocalizePythonBackendExecutionEnvironmentPath(
const std::string& model_path, inference::ModelConfig* config,
std::shared_ptr<LocalizedPath>* localized_model_dir)
{
if (config->backend() == "python") {
if (config->parameters().contains("EXECUTION_ENV_PATH")) {
// Read EXECUTION_ENV_PATH
std::string exec_env_path =
config->parameters().at("EXECUTION_ENV_PATH").string_value();
// Replace model directory variable with model_path
std::string model_dir_var = "$$TRITON_MODEL_DIRECTORY";
if (exec_env_path.substr(0, model_dir_var.size()) == model_dir_var) {
exec_env_path.replace(0, model_dir_var.size(), model_path);
}
// Collapse any .. in the path
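// (For example, a hypothetical path "/models/m/1/../env.tar.gz" collapses
// to "/models/m/env.tar.gz".)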
std::string abs_exec_env_path;
std::size_t prev_pos = exec_env_path.size();
std::size_t pos = exec_env_path.find_last_of('/', prev_pos - 1);
int skip = 0;
while (pos != std::string::npos && prev_pos > 0) {
if (!skip) {
abs_exec_env_path =
exec_env_path.substr(pos, prev_pos - pos) + abs_exec_env_path;
}
skip = skip > 0 ? skip - 1 : skip;
if (pos >= 3 && exec_env_path.substr(pos - 3, 3) == "/..") {
skip += 2;
}
prev_pos = pos;
pos = exec_env_path.find_last_of('/', prev_pos - 1);
}
abs_exec_env_path = exec_env_path.substr(0, prev_pos) + abs_exec_env_path;
// Localize iff abs_exec_env_path is outside the model directory
std::string model_path_slash =
model_path.back() == '/' ? model_path : model_path + "/";
if (abs_exec_env_path.substr(0, model_path_slash.size()) !=
model_path_slash) {
// Localize the file
std::shared_ptr<LocalizedPath> localized_exec_env_path;
RETURN_IF_ERROR(
LocalizePath(abs_exec_env_path, &localized_exec_env_path));
// Persist the localized temporary path
(*localized_model_dir)
->other_localized_path.push_back(localized_exec_env_path);
// Rewrite EXECUTION_ENV_PATH
config->mutable_parameters()
->at("EXECUTION_ENV_PATH")
.set_string_value(localized_exec_env_path->Path());
}
}
}
return Status::Success;
}
Status
SetDefaultInstanceCount(
inference::ModelInstanceGroup* group, const std::string& backend)
{
group->set_count(1);
// Backends opt in to the default_cpu_instance_count since some backends
// (PyTorch, OpenVINO) don't perform well or have high overhead when using
// multiple instances.
const int default_cpu_instance_count = 2;
bool use_default_cpu_instance_count =
(backend == kTensorFlowBackend) || (backend == kOnnxRuntimeBackend);
if (group->kind() == inference::ModelInstanceGroup::KIND_CPU &&
use_default_cpu_instance_count) {
group->set_count(default_cpu_instance_count);
}
return Status::Success;
}
Status
AutoCompleteBackendFields(
const std::string& model_name, const std::string& model_path,
inference::ModelConfig* config)
{
std::set<std::string> version_dirs;
RETURN_IF_ERROR(GetDirectorySubdirs(model_path, &version_dirs));
// There must be at least one version directory that we can inspect to
// attempt to determine the platform. If there is none, we skip autofill
// based on file names. For now we allow multiple versions and only inspect
// the first version directory to ensure it is valid. We can add more
// aggressive checks later.
const bool has_version = (version_dirs.size() != 0);
const auto version_path =
has_version ? JoinPath({model_path, *(version_dirs.begin())}) : "";
std::set<std::string> version_dir_content;
if (has_version) {
RETURN_IF_ERROR(GetDirectoryContents(version_path, &version_dir_content));
}
// If the model name is not given in the configuration, set it based
// on the model path.
if (config->name().empty()) {
config->set_name(model_name);
}
// Try to fill the 'backend' and 'default_model_filename' fields.
// TensorFlow
// For the TensorFlow backend, the platform is required.
if (config->platform().empty()) {
// Check 'backend', 'default_model_filename', and the actual directory
// to determine the platform
if (config->backend().empty() ||
(config->backend() == kTensorFlowBackend)) {
if (config->default_model_filename() == kTensorFlowSavedModelFilename) {
config->set_platform(kTensorFlowSavedModelPlatform);
} else if (
config->default_model_filename() == kTensorFlowGraphDefFilename) {
config->set_platform(kTensorFlowGraphDefPlatform);
} else if (config->default_model_filename().empty() && has_version) {
bool is_dir = false;
if (version_dir_content.find(kTensorFlowSavedModelFilename) !=
version_dir_content.end()) {
RETURN_IF_ERROR(IsDirectory(
JoinPath({version_path, kTensorFlowSavedModelFilename}),
&is_dir));
if (is_dir) {
config->set_platform(kTensorFlowSavedModelPlatform);
}
}
if (version_dir_content.find(kTensorFlowGraphDefFilename) !=
version_dir_content.end()) {
RETURN_IF_ERROR(IsDirectory(
JoinPath({version_path, kTensorFlowGraphDefFilename}), &is_dir));
if (!is_dir) {
config->set_platform(kTensorFlowGraphDefPlatform);
}
}
}
}
}
// Fill 'backend' and 'default_model_filename' if missing
if ((config->platform() == kTensorFlowSavedModelPlatform) ||
(config->platform() == kTensorFlowGraphDefPlatform)) {
if (config->backend().empty()) {
config->set_backend(kTensorFlowBackend);
}
if (config->default_model_filename().empty()) {
if (config->platform() == kTensorFlowSavedModelPlatform) {
config->set_default_model_filename(kTensorFlowSavedModelFilename);
} else {
config->set_default_model_filename(kTensorFlowGraphDefFilename);
}
}
return Status::Success;
}
// TensorRT
if (config->backend().empty()) {
if ((config->platform() == kTensorRTPlanPlatform) ||
(config->default_model_filename() == kTensorRTPlanFilename)) {
config->set_backend(kTensorRTBackend);
} else if (
config->platform().empty() &&
config->default_model_filename().empty() && has_version) {
bool is_dir = false;
if (version_dir_content.find(kTensorRTPlanFilename) !=
version_dir_content.end()) {
RETURN_IF_ERROR(IsDirectory(
JoinPath({version_path, kTensorRTPlanFilename}), &is_dir));
if (!is_dir) {
config->set_backend(kTensorRTBackend);
}
}
}
}
if (config->backend() == kTensorRTBackend) {
if (config->platform().empty()) {
config->set_platform(kTensorRTPlanPlatform);
}
if (config->default_model_filename().empty()) {
config->set_default_model_filename(kTensorRTPlanFilename);
}
return Status::Success;
}
// ONNXRuntime
if (config->backend().empty()) {
if ((config->platform() == kOnnxRuntimeOnnxPlatform) ||
(config->default_model_filename() == kOnnxRuntimeOnnxFilename)) {
config->set_backend(kOnnxRuntimeBackend);
} else if (
config->platform().empty() &&
config->default_model_filename().empty() && has_version) {
if (version_dir_content.find(kOnnxRuntimeOnnxFilename) !=
version_dir_content.end()) {
// An ONNX model can be a file, or a directory in the case of a large model
config->set_backend(kOnnxRuntimeBackend);
}
}
}
if (config->backend() == kOnnxRuntimeBackend) {
if (config->platform().empty()) {
config->set_platform(kOnnxRuntimeOnnxPlatform);
}
if (config->default_model_filename().empty()) {
config->set_default_model_filename(kOnnxRuntimeOnnxFilename);
}
return Status::Success;
}
// OpenVINO
if (config->backend().empty()) {
if (config->default_model_filename() == kOpenVINORuntimeOpenVINOFilename) {
config->set_backend(kOpenVINORuntimeBackend);
} else if (
config->platform().empty() &&
config->default_model_filename().empty() && has_version) {
if (version_dir_content.find(kOpenVINORuntimeOpenVINOFilename) !=
version_dir_content.end()) {
config->set_backend(kOpenVINORuntimeBackend);
}
}
}
if (config->backend() == kOpenVINORuntimeBackend) {
if (config->default_model_filename().empty()) {
config->set_default_model_filename(kOpenVINORuntimeOpenVINOFilename);
}
return Status::Success;
}
// PyTorch (TorchScript, LibTorch)
if (config->backend().empty()) {
if ((config->platform() == kPyTorchLibTorchPlatform) ||
(config->default_model_filename() == kPyTorchLibTorchFilename)) {
config->set_backend(kPyTorchBackend);
} else if (
config->platform().empty() &&
config->default_model_filename().empty() && has_version) {
bool is_dir = false;
if (version_dir_content.find(kPyTorchLibTorchFilename) !=
version_dir_content.end()) {
RETURN_IF_ERROR(IsDirectory(
JoinPath({version_path, kPyTorchLibTorchFilename}), &is_dir));
if (!is_dir) {
config->set_backend(kPyTorchBackend);
}
}
}
}
if (config->backend() == kPyTorchBackend) {
if (config->platform().empty()) {
config->set_platform(kPyTorchLibTorchPlatform);
}
if (config->default_model_filename().empty()) {
config->set_default_model_filename(kPyTorchLibTorchFilename);
}
return Status::Success;
}
// Python
if (config->backend().empty()) {
if (config->default_model_filename() == kPythonFilename) {
config->set_backend(kPythonBackend);
} else if (
config->platform().empty() &&
config->default_model_filename().empty() && has_version) {
if (version_dir_content.find(kPythonFilename) !=
version_dir_content.end()) {
config->set_backend(kPythonBackend);
}
}
}
if (config->backend() == kPythonBackend) {
if (config->default_model_filename().empty()) {
config->set_default_model_filename(kPythonFilename);
}
return Status::Success;
}
// Custom Backend
// For now, only do the narrowest case, where no info is given in the config.
if (config->backend().empty() && config->platform().empty() &&
config->default_model_filename().empty()) {
LOG_VERBOSE(1) << "Could not infer supported backend, so attempting "
"autofill of custom backend.";
// Since backends are loaded lazily, we let the model tell us which backend
// to load. If the model name conforms to the required form
// 'model.<backend_name>', we parse the backend name from it; e.g. a model
// named "model.identity" sets the backend to "identity".
const std::string delimiter = ".";
size_t pos = model_name.find(delimiter, 0);
if (pos == std::string::npos) {
return Status(
triton::common::Error::Code::INVALID_ARG,
("Invalid model name: Could not determine backend for model '" +
model_name +
"' with no backend in model configuration. Expected model name of "
"the form 'model.<backend_name>'."));
}
const std::string backend_name =
model_name.substr(pos + 1, std::string::npos);
config->set_backend(backend_name);
config->set_default_model_filename(
(std::string("model.") + backend_name).c_str());
return Status::Success;
}
return Status::Success;
}
Status
ValidateModelIOConfig(const inference::ModelConfig& config)
{
Status status;
for (const auto& io : config.input()) {
status = ValidateModelInput(io, config.max_batch_size(), config.platform());
if (!status.IsOk()) {
return Status(
status.StatusCode(), status.Message() + " for " + config.name());
}
}
for (const auto& io : config.output()) {
status =
ValidateModelOutput(io, config.max_batch_size(), config.platform());
if (!status.IsOk()) {
return Status(
status.StatusCode(), status.Message() + " for " + config.name());
}
}
status = ValidateBatchIO(config);
if (!status.IsOk()) {
return Status(
status.StatusCode(), status.Message() + " for " + config.name());
}
return Status::Success;
}
Status
ValidateBatchIO(const inference::ModelConfig& config)
{
std::set<std::string> input_names;
std::set<std::string> output_names;
for (const auto& io : config.input()) {
input_names.emplace(io.name());
}
for (const auto& io : config.output()) {
output_names.emplace(io.name());
}
for (const auto& batch_io : config.batch_input()) {
switch (batch_io.kind()) {
case inference::BatchInput::BATCH_ELEMENT_COUNT:
case inference::BatchInput::BATCH_ACCUMULATED_ELEMENT_COUNT:
case inference::BatchInput::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO:
case inference::BatchInput::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE:
case inference::BatchInput::BATCH_ITEM_SHAPE:
case inference::BatchInput::BATCH_ITEM_SHAPE_FLATTEN: {
if (batch_io.source_input_size() != 1) {
return Status(
Status::Code::INVALID_ARG,
"batch input kind '" +
inference::BatchInput::Kind_Name(batch_io.kind()) +
"' expects 1 source input, got " +
std::to_string(batch_io.source_input_size()));
}
break;
}
default:
return Status(
Status::Code::INVALID_ARG,
"unknown batch input kind '" +
inference::BatchInput::Kind_Name(batch_io.kind()) + "'");
}
if ((batch_io.data_type() != inference::DataType::TYPE_INT32) &&
(batch_io.data_type() != inference::DataType::TYPE_FP32)) {
return Status(
Status::Code::INVALID_ARG,
"batch input data type must be TYPE_INT32 or TYPE_FP32");
}
for (const auto& source_name : batch_io.source_input()) {
if (input_names.find(source_name) == input_names.end()) {
return Status(
Status::Code::INVALID_ARG,
"unknown source input name '" + source_name + "'");
}
}
}
for (const auto& batch_io : config.batch_output()) {
switch (batch_io.kind()) {
case inference::BatchOutput::BATCH_SCATTER_WITH_INPUT_SHAPE: {
if (batch_io.source_input_size() != 1) {
return Status(
Status::Code::INVALID_ARG,
"batch output kind '" +
inference::BatchOutput::Kind_Name(batch_io.kind()) +
"' expects 1 source input, got " +
std::to_string(batch_io.source_input_size()));
}
break;
}
default:
return Status(
Status::Code::INVALID_ARG,
"unknown batch output kind '" +
inference::BatchOutput::Kind_Name(batch_io.kind()) + "'");
}
for (const auto& source_name : batch_io.source_input()) {
if (input_names.find(source_name) == input_names.end()) {
return Status(
Status::Code::INVALID_ARG,
"unknown source input name '" + source_name + "'");
}
}
std::set<std::string> target_names;
for (const auto& target_name : batch_io.target_name()) {
if (output_names.find(target_name) == output_names.end()) {
return Status(
Status::Code::INVALID_ARG,
"unknown target output name '" + target_name + "'");
}
if (target_names.emplace(target_name).second == false) {
return Status(
Status::Code::INVALID_ARG, "target output name '" + target_name +
"' can only be specified once");
}
}
}
return Status::Success;
}
Status
ValidateModelConfig(
const inference::ModelConfig& config, const double min_compute_capability)
{
if (config.name().empty()) {
return Status(
Status::Code::INVALID_ARG, "model configuration must specify 'name'");
}
if (config.backend().empty()) {
// Backend must not be empty unless the platform is the ensemble platform.
#ifdef TRITON_ENABLE_ENSEMBLE
if (config.platform() != kEnsemblePlatform)
#endif // TRITON_ENABLE_ENSEMBLE
return Status(
Status::Code::INVALID_ARG, "unexpected platform type '" +
config.platform() + "' for " +
config.name());
}
#ifdef TRITON_ENABLE_ENSEMBLE
else if (config.platform() == kEnsemblePlatform) {
return Status(
Status::Code::INVALID_ARG,
"Ensemble model '" + config.name() + "' must have platform type '" +
config.platform() + "' and empty backend type");
}
#endif // TRITON_ENABLE_ENSEMBLE
if (config.platform().empty() && config.backend().empty()) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'platform' or 'backend' for '" + config.name() + "'");
}
// Ensure both platform and backend are referring to known backend,
// or both referring to unknown backend for user-provided backend.
if (GetBackendTypeFromPlatform(config.platform()) !=
GetBackendType(config.backend())) {
return Status(
Status::Code::INVALID_ARG,
"unexpected 'platform' and 'backend' pair, got:" + config.platform() +
", " + config.backend());
}
if (config.max_batch_size() < 0) {
return Status(
Status::Code::INVALID_ARG,
"'max_batch_size' must be non-negative value for " + config.name());
}
if (!config.has_version_policy()) {
return Status(
Status::Code::INVALID_ARG,
"must specify 'version policy' for " + config.name());
}
// If dynamic batching is specified make sure the preferred batch
// sizes are positive and don't exceed maximum batch size.
if (config.has_dynamic_batching()) {
for (const auto size : config.dynamic_batching().preferred_batch_size()) {
if (size <= 0) {
return Status(
Status::Code::INVALID_ARG,
"dynamic batching preferred size must be positive for " +
config.name());
}
if (size > config.max_batch_size()) {
return Status(
Status::Code::INVALID_ARG,
"dynamic batching preferred size must be <= max batch size for " +
config.name());
}
}
// Priority queue is specified
const auto priority_levels = config.dynamic_batching().priority_levels();
if (priority_levels != 0) {
if ((config.dynamic_batching().default_priority_level() == 0) ||
(config.dynamic_batching().default_priority_level() >
priority_levels)) {
return Status(
Status::Code::INVALID_ARG,
"default priority level must be in range [1, " +
std::to_string(priority_levels) + "] for " + config.name());
}
for (const auto& queue_policy :
config.dynamic_batching().priority_queue_policy()) {
if ((queue_policy.first == 0) ||
(queue_policy.first > priority_levels)) {
return Status(
Status::Code::INVALID_ARG,
"priority queue policy must have priority level in range [1, " +
std::to_string(priority_levels) + "] for " + config.name());
}
}
}
// preserve ordering option will conflict with priorities and delay policy
if (config.dynamic_batching().preserve_ordering()) {
if (priority_levels > 1) {
return Status(
Status::Code::INVALID_ARG,
"Only one priority level is allowed when 'preserve_ordering' is "
"true for " +
config.name());
}
const auto& default_policy =
config.dynamic_batching().default_queue_policy();
if ((default_policy.default_timeout_microseconds() != 0) &&
(default_policy.timeout_action() ==
inference::ModelQueuePolicy::DELAY)) {
return Status(
Status::Code::INVALID_ARG,
"Queue policy can not have DELAY as timeout action when "
"'preserve_ordering' is true for " +
config.name());
}
// Also need to check policy in 'priority_queue_policy'
// for single priority case
for (const auto& policy :
config.dynamic_batching().priority_queue_policy()) {
if ((policy.second.default_timeout_microseconds() != 0) &&
(policy.second.timeout_action() ==
inference::ModelQueuePolicy::DELAY)) {
return Status(
Status::Code::INVALID_ARG,
"Queue policy can not have DELAY as timeout action when "
"'preserve_ordering' is true for " +
config.name());
}
}
}
}
// If sequence batching is specified make sure the control is
// specified correctly.
if (config.has_sequence_batching()) {
const auto& batcher = config.sequence_batching();
// Check boolean controls...
std::string tensor_name;
RETURN_IF_ERROR(GetBooleanSequenceControlProperties(
batcher, config.name(),
inference::ModelSequenceBatching::Control::CONTROL_SEQUENCE_START,
false /* required */, &tensor_name, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr));
RETURN_IF_ERROR(GetBooleanSequenceControlProperties(
batcher, config.name(),
inference::ModelSequenceBatching::Control::CONTROL_SEQUENCE_END,
false /* required */, &tensor_name, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr));
RETURN_IF_ERROR(GetBooleanSequenceControlProperties(
batcher, config.name(),
inference::ModelSequenceBatching::Control::CONTROL_SEQUENCE_READY,
false /* required */, &tensor_name, nullptr, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr));
// Check CORRID control and make sure it is one of the allowed types.
inference::DataType tensor_datatype;
RETURN_IF_ERROR(GetTypedSequenceControlProperties(
batcher, config.name(),
inference::ModelSequenceBatching::Control::CONTROL_SEQUENCE_CORRID,
false /* required */, &tensor_name, &tensor_datatype));
if (!tensor_name.empty()) {
if ((tensor_datatype != inference::DataType::TYPE_UINT64) &&
(tensor_datatype != inference::DataType::TYPE_INT64) &&
(tensor_datatype != inference::DataType::TYPE_UINT32) &&
(tensor_datatype != inference::DataType::TYPE_INT32) &&
(tensor_datatype != inference::DataType::TYPE_STRING)) {
return Status(
Status::Code::INVALID_ARG,
"unexpected data type for control " +
inference::ModelSequenceBatching_Control_Kind_Name(
inference::ModelSequenceBatching::Control::
CONTROL_SEQUENCE_CORRID) +
" for " + config.name() +
". Allowed data types are TYPE_UINT64, TYPE_INT64, "
"TYPE_UINT32, "
"TYPE_INT32 and TYPE_STRING");
}
}
// If oldest-first strategy is enabled make sure the preferred
// batch sizes are positive and don't exceed maximum batch size.
if (config.sequence_batching().has_oldest()) {
for (const auto size :
config.sequence_batching().oldest().preferred_batch_size()) {
if (size <= 0) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching preferred batch size must be positive for " +
config.name());
}
if (size > config.max_batch_size()) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching preferred batch size must be <= max batch "
"size for " +
config.name());
}
}
}
// If direct strategy is enabled make sure the minimum slot utilization is
// in range (0.0, 1.0]
if (config.sequence_batching().has_direct()) {
if ((config.sequence_batching().direct().minimum_slot_utilization() <
0.0) ||
(config.sequence_batching().direct().minimum_slot_utilization() >
1.0)) {
return Status(
Status::Code::INVALID_ARG,
"sequence batching minimum slot utilization must be in range "
"(0.0, 1.0] for " +
config.name());
}
}
}
// If ensemble scheduling is specified, validate it. Otherwise,
// must validate platform and instance_group
if (config.has_ensemble_scheduling()) {
#ifdef TRITON_ENABLE_ENSEMBLE
RETURN_IF_ERROR(ValidateEnsembleSchedulingConfig(config));
#else
return Status(
Status::Code::INVALID_ARG, "ensemble scheduling not supported");
#endif // TRITON_ENABLE_ENSEMBLE
}
#ifdef TRITON_ENABLE_ENSEMBLE
else if (config.platform() == kEnsemblePlatform) {
return Status(
Status::Code::INVALID_ARG,
"ensemble scheduling must be set for ensemble " + config.name() +
" whose platform is " + kEnsemblePlatform);
}
#endif // TRITON_ENABLE_ENSEMBLE
// FIXME: DLIS-3916 - Response Cache does not yet support decoupled models
if (config.model_transaction_policy().decoupled() &&
config.response_cache().enable()) {
return Status(
Status::Code::INVALID_ARG,
"Response Cache does not currently support model " + config.name() +
" with 'decoupled' transaction policy. Please disable the response"
" cache.");
}
return Status::Success;
}
Status
ValidateInstanceGroup(
const inference::ModelConfig& config, const double min_compute_capability)
{
// Instance group setting doesn't apply to ensemble
if (config.has_ensemble_scheduling()) {
return Status::Success;
}
if (config.instance_group().size() == 0) {
return Status(
Status::Code::INVALID_ARG,
"must specify one or more 'instance group's for " + config.name());
}
// Make sure KIND_GPU instance group specifies at least one GPU and
// doesn't specify a non-existent GPU. Make sure non-KIND_GPU does
// not specify any GPUs.
#ifdef TRITON_ENABLE_GPU
std::set<int> supported_gpus;
Status status = GetSupportedGPUs(&supported_gpus, min_compute_capability);
if (!status.IsOk()) {
return status;
}
#endif // TRITON_ENABLE_GPU
for (const auto& group : config.instance_group()) {
if (group.kind() == inference::ModelInstanceGroup::KIND_MODEL) {
if (group.gpus().size() > 0) {
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" has kind KIND_MODEL but specifies one or more GPUs");
}
} else if (group.kind() == inference::ModelInstanceGroup::KIND_GPU) {
#if !defined(TRITON_ENABLE_GPU) && !defined(TRITON_ENABLE_MALI_GPU)
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" has kind KIND_GPU but server does not support GPUs");
#elif defined(TRITON_ENABLE_GPU)
if (group.gpus().size() == 0) {
if (supported_gpus.size() == 0) {
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" has kind KIND_GPU but no GPUs are available");
} else {
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" has kind KIND_GPU but specifies no GPUs");
}
}
for (const int32_t gid : group.gpus()) {
if (supported_gpus.find(gid) == supported_gpus.end()) {
std::string supported_gpus_str;
for (const auto& cc : supported_gpus) {
if (!supported_gpus_str.empty()) {
supported_gpus_str += ", ";
}
supported_gpus_str += std::to_string(cc);
}
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" specifies invalid or unsupported gpu id " +
std::to_string(gid) +
". GPUs with at least the minimum required CUDA compute "
"compatibility of " +
std::to_string(min_compute_capability) +
" are: " + supported_gpus_str);
}
}
#endif // ! TRITON_ENABLE_GPU && ! TRITON_ENABLE_MALI_GPU
} else if (group.kind() == inference::ModelInstanceGroup::KIND_CPU) {
if (group.gpus().size() > 0) {
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" has kind KIND_CPU but specifies one or more GPUs");
}
} else {
return Status(
Status::Code::INTERNAL, "instance group " + group.name() +
" of model " + config.name() +
" has unexpected kind KIND_AUTO");
}
if ((config.platform() != kTensorRTPlanPlatform) &&
!group.profile().empty()) {
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" and platform " + config.platform() +
"specifies profile field which is only supported for "
"TensorRT models");
} else if (!group.profile().empty()) {
for (const auto& profile : group.profile()) {
int profile_index;
RETURN_IF_ERROR(GetProfileIndex(profile, &profile_index));
if (profile_index < 0) {
return Status(
Status::Code::INVALID_ARG,
"instance group " + group.name() + " of model " + config.name() +
" and platform " + config.platform() +
" specifies invalid profile " + profile +
". The field should contain the string representation of a "
"non-negative integer.");
}
}
}
}
return Status::Success;
}
Status
ValidateModelInput(
const inference::ModelInput& io, int32_t max_batch_size,
const std::string& platform)
{
RETURN_IF_ERROR(ValidateIOShape(io, max_batch_size, "model input "));
if (((io.format() == inference::ModelInput::FORMAT_NHWC) ||
(io.format() == inference::ModelInput::FORMAT_NCHW)) &&
(io.dims_size() != 3)) {
return Status(
Status::Code::INVALID_ARG, "model input NHWC/NCHW require 3 dims");
}
if ((platform != kTensorRTPlanPlatform) && io.is_shape_tensor()) {
return Status(
Status::Code::INVALID_ARG,
"shape tensors are only supported for TensorRT platform");
}
return Status::Success;
}
Status
CheckAllowedModelInput(
const inference::ModelInput& io, const std::set<std::string>& allowed)
{
if (allowed.find(io.name()) == allowed.end()) {
std::string astr;
for (const auto& a : allowed) {
if (!astr.empty()) {
astr.append(", ");
}
astr.append(a);
}
return Status(
Status::Code::INVALID_ARG, "unexpected inference input '" + io.name() +
"', allowed inputs are: " + astr);
}
return Status::Success;
}
Status
ValidateModelOutput(
const inference::ModelOutput& io, int32_t max_batch_size,
const std::string& platform)
{
RETURN_IF_ERROR(ValidateIOShape(io, max_batch_size, "model output "));
if ((platform != kTensorRTPlanPlatform) && io.is_shape_tensor()) {
return Status(
Status::Code::INVALID_ARG,
"shape tensors are only supported for TensorRT platform");
}
return Status::Success;
}
Status
CheckAllowedModelOutput(
const inference::ModelOutput& io, const std::set<std::string>& allowed)
{
if (allowed.find(io.name()) == allowed.end()) {
std::string astr;
for (const auto& a : allowed) {
if (!astr.empty()) {
astr.append(", ");
}
astr.append(a);
}
return Status(
Status::Code::INVALID_ARG, "unexpected inference output '" + io.name() +
"', allowed outputs are: " + astr);
}
return Status::Success;
}
Status
ParseBoolParameter(
const std::string& key, std::string value, bool* parsed_value)
{
std::transform(
value.begin(), value.end(), value.begin(),
[](unsigned char c) { return std::tolower(c); });
if ((value == "true") || (value == "1")) {
*parsed_value = true;
} else if ((value == "false") || (value == "0")) {
*parsed_value = false;
} else {
return Status(
Status::Code::INVALID_ARG,
"failed to convert " + key + " '" + value + "' to boolean value");
}
return Status::Success;
}
Status
ParseLongLongParameter(
const std::string& key, const std::string& value, int64_t* parsed_value)
{
try {
*parsed_value = std::stoll(value);
}
catch (const std::invalid_argument& ia) {
return Status(
Status::Code::INVALID_ARG,
"failed to convert " + key + " '" + value + "' to integral number");
}
return Status::Success;
}
Status
GetProfileIndex(const std::string& profile_name, int* profile_index)
{
if (profile_name.empty()) {
return Status(Status::Code::INVALID_ARG, "profile name must not be empty");
}
try {
*profile_index = stoi(profile_name);
}
catch (const std::invalid_argument& ia) {
return Status(
Status::Code::INVALID_ARG,
"unable to parse '" + profile_name + "': " + ia.what());
}
return Status::Success;
}
namespace {
Status
CollectInt64Fields(
google::protobuf::Message* message, const std::string& prefix,
std::set<std::string>* int64_fields)
{
const google::protobuf::Descriptor* desc = message->GetDescriptor();
const google::protobuf::Reflection* refl = message->GetReflection();
for (int i = 0; i < desc->field_count(); ++i) {
const google::protobuf::FieldDescriptor* field = desc->field(i);
const std::string fullname = prefix + "::" + field->name();
switch (field->type()) {
case google::protobuf::FieldDescriptor::TYPE_MESSAGE: {
if (field->is_repeated()) {
int rsize = refl->FieldSize(*message, field);
if (rsize == 0) {
refl->AddMessage(message, field);
}
rsize = refl->FieldSize(*message, field);
for (int r = 0; r < rsize; ++r) {
RETURN_IF_ERROR(CollectInt64Fields(
refl->MutableRepeatedMessage(message, field, r), fullname,
int64_fields));
}
} else {
RETURN_IF_ERROR(CollectInt64Fields(
refl->MutableMessage(message, field), fullname, int64_fields));
}
} break;
case google::protobuf::FieldDescriptor::TYPE_INT64:
case google::protobuf::FieldDescriptor::TYPE_UINT64:
case google::protobuf::FieldDescriptor::TYPE_SINT64:
case google::protobuf::FieldDescriptor::TYPE_FIXED64:
case google::protobuf::FieldDescriptor::TYPE_SFIXED64:
int64_fields->insert(fullname);
break;
default:
break;
}
}
return Status::Success;
}
Status
ValidateModelConfigInt64()
{
// Must initialize a dummy ModelConfig so that all fields are
// visited.
inference::ModelConfig config;
std::set<std::string> int64_fields;
RETURN_IF_ERROR(CollectInt64Fields(&config, "ModelConfig", &int64_fields));
LOG_VERBOSE(1) << "ModelConfig 64-bit fields:";
for (const auto& f : int64_fields) {
LOG_VERBOSE(1) << "\t" << f;
}
// We expect to find exactly the following fields. If we get an
// error from this code ModelConfig has added or removed a 64-bit
// field and we need to adjust here and in ModelConfigToJson below.
std::set<std::string> expected{
"ModelConfig::input::dims",
"ModelConfig::input::reshape::shape",
"ModelConfig::output::dims",
"ModelConfig::output::reshape::shape",
"ModelConfig::version_policy::specific::versions",
"ModelConfig::dynamic_batching::max_queue_delay_microseconds",
"ModelConfig::dynamic_batching::default_queue_policy::default_timeout_"
"microseconds",
"ModelConfig::dynamic_batching::priority_queue_policy::value::default_"
"timeout_microseconds",
"ModelConfig::sequence_batching::direct::max_queue_delay_microseconds",
"ModelConfig::sequence_batching::state::dims",
"ModelConfig::sequence_batching::state::initial_state::dims",
"ModelConfig::sequence_batching::oldest::max_queue_delay_microseconds",
"ModelConfig::sequence_batching::max_sequence_idle_microseconds",
"ModelConfig::ensemble_scheduling::step::model_version",
"ModelConfig::model_warmup::inputs::value::dims",
"ModelConfig::optimization::cuda::graph_spec::input::value::dim",
"ModelConfig::optimization::cuda::graph_spec::graph_lower_bound::input::"
"value::dim",
"ModelConfig::instance_group::secondary_devices::device_id"};
if (int64_fields != expected) {
return Status(
Status::Code::INTERNAL, "ModelConfig 64-bit field needs update");
}
return Status::Success;
}
Status
FixInt(
triton::common::TritonJson::Value& document,
triton::common::TritonJson::Value& io, const std::string& name)
{
triton::common::TritonJson::Value str_value;
if (!io.Find(name.c_str(), &str_value)) {
return Status::Success;
}
std::string str;
RETURN_IF_ERROR(str_value.AsString(&str));
int64_t d;
try {
d = std::atoll(str.c_str());
}
catch (...) {
return Status(
Status::Code::INTERNAL,
(std::string("unable to convert '") + str + "' to integer"));
}
str_value.SetInt(d);
return Status::Success;
}
Status
FixIntArray(
triton::common::TritonJson::Value& document,
triton::common::TritonJson::Value& io, const std::string& name)
{
triton::common::TritonJson::Value fixed_shape_array(
document, triton::common::TritonJson::ValueType::ARRAY);
if (!io.Find(name.c_str())) {
return Status::Success;
}
triton::common::TritonJson::Value shape_array;
RETURN_IF_ERROR(io.MemberAsArray(name.c_str(), &shape_array));
for (size_t i = 0; i < shape_array.ArraySize(); ++i) {
std::string str;
RETURN_IF_ERROR(shape_array.IndexAsString(i, &str));
int64_t d;
try {
d = std::atoll(str.c_str());
}
catch (...) {
return Status(
Status::Code::INTERNAL,
(std::string("unable to convert '") + str + "' to integer"));
}
RETURN_IF_ERROR(fixed_shape_array.AppendInt(d));
}
shape_array.Swap(fixed_shape_array);
fixed_shape_array.Release();
return Status::Success;
}
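// FixObjectArray: apply FixInt to the member 'name' of every object element
// of array 'arr', e.g. the 'model_version' member of each
// ensemble_scheduling::step entry below.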
Status
FixObjectArray(
triton::common::TritonJson::Value& document,
triton::common::TritonJson::Value& arr, const std::string& name)
{
for (size_t i = 0; i < arr.ArraySize(); ++i) {
triton::common::TritonJson::Value obj;
RETURN_IF_ERROR(arr.IndexAsObject(i, &obj));
RETURN_IF_ERROR(FixInt(document, obj, name));
}
return Status::Success;
}
} // namespace
Status
ModelConfigToJson(
const inference::ModelConfig& config, const uint32_t config_version,
std::string* json_str)
{
// Currently only support 'config_version' 1, which is the JSON
// representation of the ModelConfig protobuf with the int64 fields
// fixed to be actual numbers instead of the strings emitted by
// protobuf.
if (config_version != 1) {
return Status(
Status::Code::INVALID_ARG,
std::string("model configuration version ") +
std::to_string(config_version) +
" not supported, supported versions are: 1");
}
// Config will have 0 byte size if all fields have their default value,
// in other words the config is empty.
if (config.ByteSizeLong() == 0) {
json_str->clear();
return Status::Success;
}
std::string config_json_str;
::google::protobuf::util::JsonPrintOptions options;
options.preserve_proto_field_names = true;
options.always_print_primitive_fields = true;
::google::protobuf::util::MessageToJsonString(
config, &config_json_str, options);
// We need to verify that every 64-bit field in the ModelConfig
// protobuf is being handled. We hardcode the known fields and check
// just once to make sure everything has been handled. We could have
// this check in a separately compiled CI test but it is convenient to
// keep it here close to the code below that actually fixes the 64-bit
// fields.
{
static std::once_flag fonce;
Status status = Status::Success;
std::call_once(fonce, [&status] { status = ValidateModelConfigInt64(); });
RETURN_IF_ERROR(status);
}
// In the json produced by protobuf, int64 and uint64 values are
// represented as strings. Protobuf doesn't provide an option to
// disable this (sigh) so we need to fix it up here as we want the
// json representation of the config to be reasonable json...
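// For illustration only (hypothetical snippet): protobuf emits
//   {"dynamic_batching":{"max_queue_delay_microseconds":"100"}}
// and the fix-ups below rewrite it to
//   {"dynamic_batching":{"max_queue_delay_microseconds":100}}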
triton::common::TritonJson::Value config_json;
RETURN_IF_ERROR(config_json.Parse(config_json_str));
// Fix input::dims, input::reshape::shape, output::dims,
// output::reshape::shape
for (std::string name : {"input", "output"}) {
triton::common::TritonJson::Value ios;
RETURN_IF_ERROR(config_json.MemberAsArray(name.c_str(), &ios));
for (size_t i = 0; i < ios.ArraySize(); ++i) {
triton::common::TritonJson::Value io;
RETURN_IF_ERROR(ios.IndexAsObject(i, &io));
RETURN_IF_ERROR(FixIntArray(config_json, io, "dims"));
triton::common::TritonJson::Value reshape;
if (io.Find("reshape", &reshape)) {
RETURN_IF_ERROR(FixIntArray(config_json, reshape, "shape"));
}
}
}
// Fix version_policy::specific::versions
{
triton::common::TritonJson::Value vp;
if (config_json.Find("version_policy", &vp)) {
triton::common::TritonJson::Value specific;
if (vp.Find("specific", &specific)) {
RETURN_IF_ERROR(FixIntArray(config_json, specific, "versions"));
}
}
}
// Fix dynamic_batching::max_queue_delay_microseconds,
// dynamic_batching::default_queue_policy::default_timeout_microseconds,
// dynamic_batching::priority_queue_policy::value::default_timeout_microseconds
{
triton::common::TritonJson::Value db;
if (config_json.Find("dynamic_batching", &db)) {
RETURN_IF_ERROR(FixInt(config_json, db, "max_queue_delay_microseconds"));
triton::common::TritonJson::Value dqp;
if (db.Find("default_queue_policy", &dqp)) {
RETURN_IF_ERROR(
FixInt(config_json, dqp, "default_timeout_microseconds"));
}
triton::common::TritonJson::Value pqp;
if (db.Find("priority_queue_policy", &pqp)) {
// Iterate over each member in 'pqp' and fix...
std::vector<std::string> members;
RETURN_IF_ERROR(pqp.Members(&members));
for (const auto& m : members) {
triton::common::TritonJson::Value el;
RETURN_IF_ERROR(pqp.MemberAsObject(m.c_str(), &el));
RETURN_IF_ERROR(
FixInt(config_json, el, "default_timeout_microseconds"));
}
}
}
}
// Fix sequence_batching::oldest::max_queue_delay_microseconds,
// sequence_batching::direct::max_queue_delay_microseconds,
// sequence_batching::max_sequence_idle_microseconds,
// sequence_batching::state::dims and
// sequence_batching::state::initial_state::dims
{
triton::common::TritonJson::Value sb;
if (config_json.Find("sequence_batching", &sb)) {
RETURN_IF_ERROR(
FixInt(config_json, sb, "max_sequence_idle_microseconds"));
triton::common::TritonJson::Value oldest;
if (sb.Find("oldest", &oldest)) {
RETURN_IF_ERROR(
FixInt(config_json, oldest, "max_queue_delay_microseconds"));
}
triton::common::TritonJson::Value direct;
if (sb.Find("direct", &direct)) {
RETURN_IF_ERROR(
FixInt(config_json, direct, "max_queue_delay_microseconds"));
}
triton::common::TritonJson::Value states;
if (sb.Find("state", &states)) {
for (size_t i = 0; i < states.ArraySize(); ++i) {
triton::common::TritonJson::Value state;
RETURN_IF_ERROR(states.IndexAsObject(i, &state));
RETURN_IF_ERROR(FixIntArray(config_json, state, "dims"));
triton::common::TritonJson::Value initial_state;
// 'initial_state' is nested within each 'state' entry, so look it up
// on 'state' rather than on the sequence_batching object.
if (state.Find("initial_state", &initial_state)) {
RETURN_IF_ERROR(FixIntArray(config_json, initial_state, "dims"));
}
}
}
}
}
// Fix ensemble_scheduling::step::model_version.
{
triton::common::TritonJson::Value ens;
if (config_json.Find("ensemble_scheduling", &ens)) {
triton::common::TritonJson::Value step;
if (ens.Find("step", &step)) {
RETURN_IF_ERROR(FixObjectArray(config_json, step, "model_version"));
}
}
}
// Fix model_warmup::inputs::value::dims.
{
triton::common::TritonJson::Value warmups;
if (config_json.Find("model_warmup", &warmups)) {
for (size_t i = 0; i < warmups.ArraySize(); ++i) {
triton::common::TritonJson::Value warmup;
RETURN_IF_ERROR(warmups.IndexAsObject(i, &warmup));
triton::common::TritonJson::Value inputs;
if (warmup.Find("inputs", &inputs)) {
std::vector<std::string> members;
RETURN_IF_ERROR(inputs.Members(&members));
for (const auto& m : members) {
triton::common::TritonJson::Value input;
RETURN_IF_ERROR(inputs.MemberAsObject(m.c_str(), &input));
RETURN_IF_ERROR(FixIntArray(config_json, input, "dims"));
}
}
}
}
}
// Convert the fixed JSON back to a string...
triton::common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(config_json.Write(&buffer));
*json_str = std::move(buffer.MutableContents());
return Status::Success;
}
Status
JsonToModelConfig(
const std::string& json_config, const uint32_t config_version,
inference::ModelConfig* protobuf_config)
{
// Currently only support 'config_version' 1, which is the JSON
// representation of the ModelConfig protobuf matching the
// representation produced by ModelConfigToJson().
if (config_version != 1) {
return Status(
Status::Code::INVALID_ARG,
std::string("model configuration version ") +
std::to_string(config_version) +
" not supported, supported versions are: 1");
}
::google::protobuf::util::JsonParseOptions options;
options.case_insensitive_enum_parsing = true;
options.ignore_unknown_fields = false;
auto err = ::google::protobuf::util::JsonStringToMessage(
json_config, protobuf_config, options);
if (!err.ok()) {
return Status(Status::Code::INVALID_ARG, std::string(err.message()));
}
return Status::Success;
}
BackendType
GetBackendTypeFromPlatform(const std::string& platform_name)
{
if ((platform_name == kTensorFlowGraphDefPlatform) ||
(platform_name == kTensorFlowSavedModelPlatform)) {
return BackendType::BACKEND_TYPE_TENSORFLOW;
}
if (platform_name == kTensorRTPlanPlatform) {
return BackendType::BACKEND_TYPE_TENSORRT;
}
if (platform_name == kOnnxRuntimeOnnxPlatform) {
return BackendType::BACKEND_TYPE_ONNXRUNTIME;
}
if (platform_name == kPyTorchLibTorchPlatform) {
return BackendType::BACKEND_TYPE_PYTORCH;
}
return BackendType::BACKEND_TYPE_UNKNOWN;
}
/// Get the BackendType value for a backend name.
/// \param backend_name The backend name.
/// \return The BackendType or BackendType::UNKNOWN if the backend name
/// is not recognized.
BackendType
GetBackendType(const std::string& backend_name)
{
if (backend_name == kTensorFlowBackend) {
return BackendType::BACKEND_TYPE_TENSORFLOW;
}
if (backend_name == kTensorRTBackend) {
return BackendType::BACKEND_TYPE_TENSORRT;
}
if (backend_name == kOnnxRuntimeBackend) {
return BackendType::BACKEND_TYPE_ONNXRUNTIME;
}
if (backend_name == kPyTorchBackend) {
return BackendType::BACKEND_TYPE_PYTORCH;
}
return BackendType::BACKEND_TYPE_UNKNOWN;
}
TRITONSERVER_DataType
DataTypeToTriton(const inference::DataType dtype)
{
switch (dtype) {
case inference::DataType::TYPE_BOOL:
return TRITONSERVER_TYPE_BOOL;
case inference::DataType::TYPE_UINT8:
return TRITONSERVER_TYPE_UINT8;
case inference::DataType::TYPE_UINT16:
return TRITONSERVER_TYPE_UINT16;
case inference::DataType::TYPE_UINT32:
return TRITONSERVER_TYPE_UINT32;
case inference::DataType::TYPE_UINT64:
return TRITONSERVER_TYPE_UINT64;
case inference::DataType::TYPE_INT8:
return TRITONSERVER_TYPE_INT8;
case inference::DataType::TYPE_INT16:
return TRITONSERVER_TYPE_INT16;
case inference::DataType::TYPE_INT32:
return TRITONSERVER_TYPE_INT32;
case inference::DataType::TYPE_INT64:
return TRITONSERVER_TYPE_INT64;
case inference::DataType::TYPE_FP16:
return TRITONSERVER_TYPE_FP16;
case inference::DataType::TYPE_FP32:
return TRITONSERVER_TYPE_FP32;
case inference::DataType::TYPE_FP64:
return TRITONSERVER_TYPE_FP64;
case inference::DataType::TYPE_STRING:
return TRITONSERVER_TYPE_BYTES;
case inference::DataType::TYPE_BF16:
return TRITONSERVER_TYPE_BF16;
default:
break;
}
return TRITONSERVER_TYPE_INVALID;
}
inference::DataType
TritonToDataType(const TRITONSERVER_DataType dtype)
{
switch (dtype) {
case TRITONSERVER_TYPE_BOOL:
return inference::DataType::TYPE_BOOL;
case TRITONSERVER_TYPE_UINT8:
return inference::DataType::TYPE_UINT8;
case TRITONSERVER_TYPE_UINT16:
return inference::DataType::TYPE_UINT16;
case TRITONSERVER_TYPE_UINT32:
return inference::DataType::TYPE_UINT32;
case TRITONSERVER_TYPE_UINT64:
return inference::DataType::TYPE_UINT64;
case TRITONSERVER_TYPE_INT8:
return inference::DataType::TYPE_INT8;
case TRITONSERVER_TYPE_INT16:
return inference::DataType::TYPE_INT16;
case TRITONSERVER_TYPE_INT32:
return inference::DataType::TYPE_INT32;
case TRITONSERVER_TYPE_INT64:
return inference::DataType::TYPE_INT64;
case TRITONSERVER_TYPE_FP16:
return inference::DataType::TYPE_FP16;
case TRITONSERVER_TYPE_FP32:
return inference::DataType::TYPE_FP32;
case TRITONSERVER_TYPE_FP64:
return inference::DataType::TYPE_FP64;
case TRITONSERVER_TYPE_BYTES:
return inference::DataType::TYPE_STRING;
case TRITONSERVER_TYPE_BF16:
return inference::DataType::TYPE_BF16;
default:
break;
}
return inference::DataType::TYPE_INVALID;
}
}} // namespace triton::core
// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "model_config.pb.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"
#include "filesystem.h"
namespace triton { namespace core {
/// Enumeration for the different backend types.
enum BackendType {
BACKEND_TYPE_UNKNOWN = 0,
BACKEND_TYPE_TENSORRT = 1,
BACKEND_TYPE_TENSORFLOW = 2,
BACKEND_TYPE_ONNXRUNTIME = 3,
BACKEND_TYPE_PYTORCH = 4
};
/// Get the version of a model from the path containing the model
/// definition file.
/// \param path The path to the model definition file.
/// \param version Returns the version.
/// \return The error status.
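/// For illustration: with the standard model repository layout the version
/// is the numeric directory name, e.g. a path ending in "/example_model/3"
/// (hypothetical) yields version 3.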
Status GetModelVersionFromPath(const std::string& path, int64_t* version);
/// Get the tensor name, false value, and true value for a boolean
/// sequence batcher control kind. If 'required' is true then a tensor
/// must be found for the control. If 'required' is false, return
/// 'tensor_name' as an empty string if the control is not mapped to any
/// tensor.
Status GetBooleanSequenceControlProperties(
const inference::ModelSequenceBatching& batcher,
const std::string& model_name,
const inference::ModelSequenceBatching::Control::Kind control_kind,
const bool required, std::string* tensor_name,
inference::DataType* tensor_datatype, float* fp32_false_value,
float* fp32_true_value, int32_t* int32_false_value,
int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value);
/// Get the tensor name and datatype for a non-boolean sequence
/// batcher control kind. If 'required' is true then a tensor must be
/// found for the control. If 'required' is false, return 'tensor_name'
/// as an empty string if the control is not mapped to any tensor.
/// 'tensor_datatype' returns the required datatype for the control.
Status GetTypedSequenceControlProperties(
const inference::ModelSequenceBatching& batcher,
const std::string& model_name,
const inference::ModelSequenceBatching::Control::Kind control_kind,
const bool required, std::string* tensor_name,
inference::DataType* tensor_datatype);
/// Read a ModelConfig and normalize it as expected by model backends.
/// \param model_name The name of the model.
/// \param path The full-path to the directory containing the
/// model configuration.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \param config Returns the normalized model configuration.
/// \return The error status.
Status GetNormalizedModelConfig(
const std::string& model_name, const std::string& path,
const double min_compute_capability, inference::ModelConfig* config);
/// Auto-complete backend-related fields (platform, backend and default model
/// filename) if they are not set. Note that only Triton-recognized backends
/// will be checked.
/// \param model_name The name of the model.
/// \param model_path The full-path to the directory containing the
/// model configuration.
/// \param config Returns the auto-completed model configuration.
/// \return The error status.
Status AutoCompleteBackendFields(
const std::string& model_name, const std::string& model_path,
inference::ModelConfig* config);
/// Detects and adds missing fields in the model configuration.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \param config The model configuration
/// \return The error status
Status NormalizeModelConfig(
const double min_compute_capability, inference::ModelConfig* config);
/// [FIXME] better formalize config normalization / validation
/// Detects and adds missing fields in instance group setting.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \param config The model configuration
/// \return The error status
Status NormalizeInstanceGroup(
const double min_compute_capability,
const std::vector<inference::ModelInstanceGroup>& preferred_groups,
inference::ModelConfig* config);
/// [FIXME] Remove once a more permanent solution is implemented (DLIS-4211)
/// Localize EXECUTION_ENV_PATH in python backend.
/// \param model_path The full-path to the directory containing the model
/// configuration, before localization.
/// \param config The model configuration
/// \param localized_model_dir The localized model directory
/// \return The error status
Status LocalizePythonBackendExecutionEnvironmentPath(
const std::string& model_path, inference::ModelConfig* config,
std::shared_ptr<LocalizedPath>* localized_model_dir);
/// Auto-complete the instance count based on instance kind and backend name.
/// \param group The instance group to set the count for.
/// \param backend The backend name to check against.
/// \return The error status.
Status SetDefaultInstanceCount(
inference::ModelInstanceGroup* group, const std::string& backend);
/// Validate that a model is specified correctly, except for model inputs
/// and outputs. ValidateModelIOConfig() should be called to
/// validate model inputs and outputs.
/// \param config The model configuration to validate.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-OK status indicates the configuration
/// is not valid.
Status ValidateModelConfig(
const inference::ModelConfig& config, const double min_compute_capability);
/// [FIXME] better formalize config normalization / validation
/// Validate instance group setting.
/// \param config The model configuration to validate.
/// \param min_compute_capability The minimum supported CUDA compute
/// capability.
/// \return The error status. A non-OK status indicates the configuration
/// is not valid.
Status ValidateInstanceGroup(
const inference::ModelConfig& config, const double min_compute_capability);
/// Validate that a model's inputs and outputs are specified correctly.
/// \param config The model configuration to validate.
/// \return The error status. A non-OK status indicates the configuration
/// is not valid.
Status ValidateModelIOConfig(const inference::ModelConfig& config);
/// Validate that an input is specified correctly in a model
/// configuration.
/// \param io The model input.
/// \param max_batch_size The max batch size specified in model configuration.
/// \param platform The platform name
/// \return The error status. A non-OK status indicates the input
/// is not valid.
Status ValidateModelInput(
const inference::ModelInput& io, int32_t max_batch_size,
const std::string& platform);
/// Validate that an input matches one of the allowed input names.
/// \param io The model input.
/// \param allowed The set of allowed input names.
/// \return The error status. A non-OK status indicates the input
/// is not valid.
Status CheckAllowedModelInput(
const inference::ModelInput& io, const std::set<std::string>& allowed);
/// Validate that an output is specified correctly in a model
/// configuration.
/// \param io The model output.
/// \param max_batch_size The max batch size specified in model configuration.
/// \param platform The platform name
/// \return The error status. A non-OK status indicates the output
/// is not valid.
Status ValidateModelOutput(
const inference::ModelOutput& io, int32_t max_batch_size,
const std::string& platform);
/// Validate that an output matches one of the allowed output names.
/// \param io The model output.
/// \param allowed The set of allowed output names.
/// \return The error status. A non-OK status indicates the output
/// is not valid.
Status CheckAllowedModelOutput(
const inference::ModelOutput& io, const std::set<std::string>& allowed);
/// Validate that a model's batch inputs and batch outputs are specified
/// correctly.
/// \param config The model configuration to validate.
/// \return The error status. A non-OK status indicates the batch inputs or
/// batch outputs are not valid.
Status ValidateBatchIO(const inference::ModelConfig& config);
/// Parse the 'value' of the parameter 'key' into a boolean value.
/// \param key The name of the parameter.
/// \param value The value of the parameter in string.
/// \param parsed_value Return the boolean of the parameter.
/// \return The error status. A non-OK status indicates failure on parsing the
/// value.
Status ParseBoolParameter(
const std::string& key, std::string value, bool* parsed_value);
/// Parse the 'value' of the parameter 'key' into a long long integer value.
/// \param key The name of the parameter.
/// \param value The value of the parameter in string.
/// \param parsed_value Return the numerical value of the parameter.
/// \return The error status. A non-OK status indicates failure on parsing the
/// value.
Status ParseLongLongParameter(
const std::string& key, const std::string& value, int64_t* parsed_value);
/// Obtain the 'profile_index' of the 'profile_name'.
/// \param profile_name The name of the profile.
/// \param profile_index Return the index of the profile.
/// \return The error status. A non-OK status indicates failure on getting the
/// value.
Status GetProfileIndex(const std::string& profile_name, int* profile_index);
/// Convert a model configuration protobuf to the equivalent json.
/// \param config The protobuf model configuration.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned.
/// \param json Returns the equivalent JSON.
/// \return The error status.
Status ModelConfigToJson(
const inference::ModelConfig& config, const uint32_t config_version,
std::string* json_str);
/// Convert a model configuration JSON to the equivalent protobuf.
/// \param config The JSON model configuration.
/// \param config_version The model configuration will be returned in
/// a format matching this version. If the configuration cannot be
/// represented in the requested version's format then an error will
/// be returned.
/// \param protobuf Returns the equivalent protobuf.
/// \return The error status.
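/// Example (sketch only; 'example_model' is a hypothetical name):
///   inference::ModelConfig config;
///   config.set_name("example_model");
///   std::string json;
///   RETURN_IF_ERROR(ModelConfigToJson(config, 1 /* config_version */, &json));
///   inference::ModelConfig parsed;
///   RETURN_IF_ERROR(JsonToModelConfig(json, 1 /* config_version */, &parsed));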
Status JsonToModelConfig(
const std::string& json_config, const uint32_t config_version,
inference::ModelConfig* protobuf_config);
/// Get the BackendType value for a platform name.
/// \param platform_name The platform name.
/// \return The BackendType or BackendType::UNKNOWN if the platform string
/// is not recognized.
BackendType GetBackendTypeFromPlatform(const std::string& platform_name);
/// Get the BackendType value for a backend name.
/// \param backend_name The backend name.
/// \return The BackendType or BackendType::UNKNOWN if the backend name
/// is not recognized.
BackendType GetBackendType(const std::string& backend_name);
/// Get the Triton server data type corresponding to a data type.
/// \param dtype The data type.
/// \return The Triton server data type.
TRITONSERVER_DataType DataTypeToTriton(const inference::DataType dtype);
/// Get the data type corresponding to a Triton server data type.
/// \param dtype The Triton server data type.
/// \return The data type.
inference::DataType TritonToDataType(const TRITONSERVER_DataType dtype);
}} // namespace triton::core