// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager.h"
#include <algorithm>
namespace triton { namespace perfanalyzer {
cb::Error
InferDataManager::Init()
{
RETURN_IF_ERROR(CreateAndPopulateInputs());
return cb::Error::Success;
}
cb::Error
InferDataManager::CreateAndPopulateInputs()
{
// All combinations of thread + input + stream + step
//
for (size_t thread_id = 0; thread_id < max_threads_; thread_id++) {
for (const auto& input : *(parser_->Inputs())) {
const std::string& name = input.first;
const ModelTensor& tensor = input.second;
for (int stream_id = 0;
stream_id < (int)data_loader_->GetDataStreamsCount(); stream_id++) {
for (int step_id = 0;
step_id < (int)data_loader_->GetTotalSteps(stream_id);
step_id += 1) {
RETURN_IF_ERROR(CreateAndPopulateInput(
thread_id, name, tensor, stream_id, step_id));
}
}
}
}
return cb::Error::Success;
}
cb::Error
InferDataManager::CreateAndPopulateInput(
const size_t thread_id, const std::string& name, const ModelTensor& tensor,
int stream_id, int step_id)
{
std::vector<TensorData> input_datas;
RETURN_IF_ERROR(GetInputData(name, tensor, stream_id, step_id, input_datas));
if (tensor.is_shape_tensor_) {
RETURN_IF_ERROR(
ValidateShapeTensor(tensor, stream_id, step_id, input_datas));
}
std::vector<int64_t> shape;
RETURN_IF_ERROR(
data_loader_->GetInputShape(tensor, stream_id, step_id, &shape));
if (!shape.empty()) {
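// When the model supports batching, prepend the batch dimension; shape
// tensors never get a batch dimension.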
if ((parser_->MaxBatchSize() != 0) && (!tensor.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
}
cb::InferInput* input;
RETURN_IF_ERROR(
CreateInferInput(&input, backend_kind_, name, shape, tensor.datatype_));
// Number of missing pieces of data for optional inputs
size_t missing_data_cnt = 0;
const size_t total_cnt = input_datas.size();
for (size_t i = 0; i < total_cnt; i++) {
if (!input_datas[i].is_valid) {
missing_data_cnt++;
} else {
RETURN_IF_ERROR(input->AppendRaw(
input_datas[i].data_ptr, input_datas[i].batch1_size));
}
}
// If every batch element provided data for this optional input, the input is
// valid. If some batch elements provided data and others did not, that is an
// invalid case and an error is returned.
if (missing_data_cnt == 0) {
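// Store the fully populated input keyed by (thread_id, name, stream_id,
// step_id) so that GetInput() can retrieve it when UpdateInputs() runs.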
inputs_.insert({{thread_id, name, stream_id, step_id}, input});
} else if (missing_data_cnt > 0 && missing_data_cnt < total_cnt) {
return cb::Error(
"For batch sizes larger than 1, the same set of inputs must be "
"specified for each batch. You cannot use a different set of "
"optional inputs for each individual batch.",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::InferInput*
InferDataManager::GetInput(
const size_t thread_id, const std::string& name, int stream_id, int step_id)
{
auto input = inputs_.find({thread_id, name, stream_id, step_id});
if (input == inputs_.end()) {
return nullptr;
} else {
return input->second;
}
}
cb::Error
InferDataManager::InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data)
{
std::vector<int64_t> shape;
RETURN_IF_ERROR(data_loader_->GetInputShape(model_tensor, 0, 0, &shape));
if (shape.empty() && (backend_kind_ == cb::BackendKind::TRITON)) {
return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR);
}
if ((parser_->MaxBatchSize() != 0) && (!model_tensor.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
cb::InferInput* infer_input;
RETURN_IF_ERROR(CreateInferInput(
&infer_input, backend_kind_, name, shape, model_tensor.datatype_));
infer_data.inputs_.push_back(infer_input);
TensorData input_data;
RETURN_IF_ERROR(data_loader_->GetInputData(model_tensor, 0, 0, input_data));
// Add optional input to request if data was found
if (input_data.is_valid) {
infer_data.valid_inputs_.push_back(infer_input);
}
if (!shape.empty()) {
size_t max_count = (parser_->MaxBatchSize() == 0) ? 1 : batch_size_;
for (size_t i = 0; i < max_count; ++i) {
RETURN_IF_ERROR(
infer_input->AppendRaw(input_data.data_ptr, input_data.batch1_size));
}
}
return cb::Error::Success;
}
cb::Error
InferDataManager::InitInferDataOutput(
const std::string& name, InferData& infer_data)
{
cb::InferRequestedOutput* requested_output;
RETURN_IF_ERROR(
cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
infer_data.outputs_.push_back(requested_output);
return cb::Error::Success;
}
cb::Error
InferDataManager::UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data)
{
// Reset inputs for this inference request
infer_data.valid_inputs_.clear();
for (const auto& input : infer_data.inputs_) {
const auto& name = input->Name();
cb::InferInput* tmp_input =
GetInput(thread_id, name, stream_index, step_index);
if (tmp_input != nullptr) {
infer_data.valid_inputs_.push_back(tmp_input);
}
}
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "infer_data_manager_base.h"
#include "model_parser.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
/// Manages the infer data needed to prepare an inference request and to hold
/// the resulting inference output from the Triton server
class InferDataManager : public InferDataManagerBase {
public:
InferDataManager(
const size_t max_threads, const int32_t batch_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
: InferDataManagerBase(batch_size, parser, factory, data_loader),
max_threads_(max_threads)
{
}
/// Initialize this object. Must be called before any other functions
/// \return cb::Error object indicating success or failure.
cb::Error Init() override;
protected:
const size_t max_threads_{1};
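// Fully populated inputs, keyed by (thread_id, input name, stream_id,
// step_id). Entries are created by CreateAndPopulateInput() and looked up by
// GetInput() when UpdateInputs() assembles a request.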
std::map<std::tuple<size_t, std::string, int, int>, cb::InferInput*> inputs_;
cb::Error CreateAndPopulateInputs();
cb::Error CreateAndPopulateInput(
const size_t thread_id, const std::string& name,
const ModelTensor& model_tensor, int stream_id, int step_id);
cb::InferInput* GetInput(
const size_t thread_id, const std::string& name, int stream_id,
int step_id);
cb::Error InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data) override;
cb::Error InitInferDataOutput(
const std::string& name, InferData& infer_data) override;
/// Helper function to update the inputs
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
cb::Error UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data);
#ifndef DOCTEST_CONFIG_DISABLE
public:
InferDataManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager_base.h"
#include <algorithm>
namespace triton { namespace perfanalyzer {
cb::Error
InferDataManagerBase::GetInputData(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id, std::vector<TensorData>& input_datas)
{
size_t max_count = tensor.is_shape_tensor_ ? 1 : batch_size_;
std::vector<int64_t> shape;
std::vector<int64_t> prev_shape;
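// Gather one batch's worth of data (a single element for shape tensors).
// local_step_id wraps around the end of the data stream so that a batch
// starting near the last step reuses data from the beginning of the stream.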
for (size_t count = 0; count < max_count; count++) {
int local_step_id =
(step_id + count) % data_loader_->GetTotalSteps(stream_id);
TensorData input_data;
RETURN_IF_ERROR(
data_loader_->GetInputShape(tensor, stream_id, local_step_id, &shape));
if (!shape.empty()) {
if (count == 0) {
prev_shape = shape;
} else {
if (!std::equal(shape.begin(), shape.end(), prev_shape.begin())) {
return cb::Error(
"cannot batch tensors with different shapes together "
"(input '" +
name + "' expected shape " + ShapeVecToString(prev_shape) +
" and received " + ShapeVecToString(shape) + ")",
pa::GENERIC_ERROR);
}
}
}
RETURN_IF_ERROR(data_loader_->GetInputData(
tensor, stream_id, local_step_id, input_data));
input_datas.push_back(input_data);
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::ValidateShapeTensor(
const ModelTensor& tensor, int stream_id, int step_id,
const std::vector<TensorData>& input_datas)
{
// Validate that steps 1 through N are exactly the same as step 0, since step
// 0 is the only one we send for shape tensors
for (size_t count = 1; count < batch_size_; count++) {
int local_step_id =
(step_id + count) % data_loader_->GetTotalSteps(stream_id);
TensorData input_data;
RETURN_IF_ERROR(data_loader_->GetInputData(
tensor, stream_id, local_step_id, input_data));
if (input_data.batch1_size != input_datas.back().batch1_size) {
return cb::Error(
"The shape tensors should be identical in a batch (mismatch "
"in size)",
pa::GENERIC_ERROR);
}
for (size_t data_idx = 0; data_idx < input_data.batch1_size; data_idx++) {
if (*(input_data.data_ptr + data_idx) !=
*(input_datas.back().data_ptr + data_idx)) {
return cb::Error(
"The shape tensors should be identical in a batch "
"(mismatch in content)",
pa::GENERIC_ERROR);
}
}
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::InitInferData(InferData& infer_data)
{
// Initialize inputs
for (const auto& input : *(parser_->Inputs())) {
RETURN_IF_ERROR(InitInferDataInput(input.first, input.second, infer_data));
}
for (const auto& output : *(parser_->Outputs())) {
RETURN_IF_ERROR(InitInferDataOutput(output.first, infer_data));
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::UpdateInferData(
size_t thread_id, int stream_index, int step_index, InferData& infer_data)
{
RETURN_IF_ERROR(data_loader_->ValidateIndexes(stream_index, step_index));
RETURN_IF_ERROR(
UpdateInputs(thread_id, stream_index, step_index, infer_data));
RETURN_IF_ERROR(
UpdateValidationOutputs(stream_index, step_index, infer_data));
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::UpdateValidationOutputs(
int stream_index, int step_index, InferData& infer_data)
{
RETURN_IF_ERROR(data_loader_->ValidateIndexes(stream_index, step_index));
infer_data.expected_outputs_.clear();
for (const auto& output : infer_data.outputs_) {
const auto& model_output = (*(parser_->Outputs()))[output->Name()];
TensorData output_data;
const int* set_shape_values = nullptr;
int set_shape_value_cnt = 0;
std::vector<TensorData> outputs;
for (size_t i = 0; i < batch_size_; ++i) {
RETURN_IF_ERROR(data_loader_->GetOutputData(
output->Name(), stream_index,
(step_index + i) % data_loader_->GetTotalSteps(0), output_data));
if (!output_data.is_valid) {
break;
}
outputs.emplace_back(output_data);
// Shape tensors only need the first batch element
if (model_output.is_shape_tensor_) {
break;
}
}
if (!outputs.empty()) {
infer_data.expected_outputs_.emplace_back(std::move(outputs));
}
}
return cb::Error::Success;
}
cb::Error
InferDataManagerBase::CreateInferInput(
cb::InferInput** infer_input, const cb::BackendKind kind,
const std::string& name, const std::vector<int64_t>& dims,
const std::string& datatype)
{
return cb::InferInput::Create(infer_input, kind, name, dims, datatype);
}
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "infer_data.h"
#include "model_parser.h"
#include "perf_utils.h"
#include "tensor_data.h"
namespace triton { namespace perfanalyzer {
/// Base class for Infer Data managers
///
class InferDataManagerBase : public IInferDataManager {
public:
InferDataManagerBase(
const int32_t batch_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
: batch_size_(batch_size), parser_(parser), factory_(factory),
data_loader_(data_loader), backend_kind_(factory->Kind())
{
}
/// Populate the target InferData object with input and output objects
/// according to the model's shape
/// \param infer_data The target InferData object.
/// \return cb::Error object indicating success or failure.
cb::Error InitInferData(InferData& infer_data) override;
/// Updates the input data to use for the inference request
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
cb::Error UpdateInferData(
size_t thread_id, int stream_index, int step_index,
InferData& infer_data) override;
protected:
size_t batch_size_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackendFactory> factory_;
std::shared_ptr<DataLoader> data_loader_;
std::unique_ptr<cb::ClientBackend> backend_;
cb::BackendKind backend_kind_;
/// Gets the input data for the specified input for the specified batch size
///
/// \param name The name of the input to get data for
/// \param tensor The ModelTensor of the input to get data for
/// \param stream_id The ID of the stream to get data for
/// \param step_id The ID of the step within the stream
/// \param input_datas The returned vector of TensorDatas
/// \return cb::Error object indicating success or failure.
cb::Error GetInputData(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id, std::vector<TensorData>& input_datas);
/// For an input with is_shape_tensor_ set, validate that it follows all
/// shape-tensor rules and return an error if it does not
/// \param tensor The ModelTensor of the input to validate
/// \param stream_id The ID of the stream to validate
/// \param step_id The ID of the step within the stream
/// \param input_datas vector of TensorDatas to validate
/// \return cb::Error object indicating success or failure.
cb::Error ValidateShapeTensor(
const ModelTensor& tensor, int stream_id, int step_id,
const std::vector<TensorData>& input_datas);
/// Helper function to update the inputs
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
virtual cb::Error UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data) = 0;
/// Updates the expected output data to use for the inference request. An
/// empty vector will be returned if there is no expected output associated
/// with the step.
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
cb::Error UpdateValidationOutputs(
int stream_index, int step_index, InferData& infer_data);
/// Creates inference input object
/// \param infer_input Output parameter storing newly created inference input
/// \param kind Backend kind
/// \param name Name of inference input
/// \param dims Shape of inference input
/// \param datatype Data type of inference input
/// \return cb::Error object indicating success or failure.
virtual cb::Error CreateInferInput(
cb::InferInput** infer_input, const cb::BackendKind kind,
const std::string& name, const std::vector<int64_t>& dims,
const std::string& datatype);
virtual cb::Error InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data) = 0;
virtual cb::Error InitInferDataOutput(
const std::string& name, InferData& infer_data) = 0;
#ifndef DOCTEST_CONFIG_DISABLE
public:
InferDataManagerBase() = default;
#endif
};
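// Illustrative sketch (not part of the original sources): the minimum a new
// manager variant must supply on top of InferDataManagerBase is the three
// pure-virtual hooks below, plus Init() from IInferDataManager. The class
// name and the trivial bodies are hypothetical placeholders, and this assumes
// IInferDataManager declares no further pure-virtual methods.
class ExampleInferDataManager : public InferDataManagerBase {
 public:
  using InferDataManagerBase::InferDataManagerBase;
  // Must be called before any other functions, per the interface contract.
  cb::Error Init() override { return cb::Error::Success; }

 protected:
  cb::Error InitInferDataInput(
      const std::string& name, const ModelTensor& model_tensor,
      InferData& infer_data) override
  {
    return cb::Error::Success;
  }
  cb::Error InitInferDataOutput(
      const std::string& name, InferData& infer_data) override
  {
    return cb::Error::Success;
  }
  cb::Error UpdateInputs(
      const size_t thread_id, const int stream_index, const int step_index,
      InferData& infer_data) override
  {
    return cb::Error::Success;
  }
};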
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "infer_data_manager.h"
#include "infer_data_manager_shm.h"
#include "model_parser.h"
namespace triton { namespace perfanalyzer {
class InferDataManagerFactory {
public:
static std::shared_ptr<IInferDataManager> CreateInferDataManager(
const size_t max_threads, const int32_t batch_size,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
{
if (shared_memory_type == SharedMemoryType::NO_SHARED_MEMORY) {
return CreateInferDataManagerNoShm(
max_threads, batch_size, parser, factory, data_loader);
} else {
return CreateInferDataManagerShm(
batch_size, shared_memory_type, output_shm_size, parser, factory,
data_loader);
}
}
private:
static std::shared_ptr<IInferDataManager> CreateInferDataManagerNoShm(
const size_t max_threads, const int32_t batch_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
{
return std::make_shared<InferDataManager>(
max_threads, batch_size, parser, factory, data_loader);
}
static std::shared_ptr<IInferDataManager> CreateInferDataManagerShm(
const int32_t batch_size, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
{
return std::make_shared<InferDataManagerShm>(
batch_size, shared_memory_type, output_shm_size, parser, factory,
data_loader);
}
};
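// Illustrative usage sketch (not part of the original sources): shows how a
// caller would pick a manager variant through the factory and initialize it
// before use. The wrapper name and the literal thread/batch values are
// hypothetical; the parser, client backend factory, and data loader are
// assumed to have been constructed elsewhere.
inline std::shared_ptr<IInferDataManager>
ExampleCreateInferDataManager(
    const std::shared_ptr<ModelParser>& parser,
    const std::shared_ptr<cb::ClientBackendFactory>& factory,
    const std::shared_ptr<DataLoader>& data_loader)
{
  // NO_SHARED_MEMORY selects InferDataManager; any other SharedMemoryType
  // selects InferDataManagerShm.
  auto manager = InferDataManagerFactory::CreateInferDataManager(
      /*max_threads=*/4, /*batch_size=*/1, SharedMemoryType::NO_SHARED_MEMORY,
      /*output_shm_size=*/0, parser, factory, data_loader);
  // Init() must succeed before InitInferData()/UpdateInferData() are called.
  if (!manager->Init().IsOk()) {
    return nullptr;
  }
  return manager;
}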
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_data_manager_shm.h"
#include <algorithm>
namespace triton { namespace perfanalyzer {
InferDataManagerShm::~InferDataManagerShm()
{
cb::Error err;
if (backend_.get() != nullptr) {
err = backend_->UnregisterAllSharedMemory();
if (!err.IsOk()) {
std::cerr << "Unable to unregister all shared memory regions"
<< std::endl;
}
if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) {
for (auto& region : shared_memory_regions_) {
if (factory_->Kind() !=
triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) {
err = backend_->UnmapSharedMemory(
shared_memory_regions_[region.first].data_.get(),
shared_memory_regions_[region.first].byte_size_);
if (!err.IsOk()) {
std::cerr << "Unable to unmap shared memory with key ("
<< region.first << "): Starting: "
<< static_cast<void*>(
shared_memory_regions_[region.first].data_.get())
<< ", size: "
<< shared_memory_regions_[region.first].byte_size_
<< std::endl;
}
err = backend_->UnlinkSharedMemoryRegion(region.first);
if (!err.IsOk()) {
std::cerr << "Unable to unlink shared memory with key: "
<< region.first << std::endl;
}
}
}
}
}
}
cb::Error
InferDataManagerShm::Init()
{
// TMA-1062 remove the factory from this class and use only the backend
RETURN_IF_ERROR(factory_->CreateClientBackend(&backend_));
// Unregister everything up front so that we start from a clean state
backend_->UnregisterAllSharedMemory();
RETURN_IF_ERROR(CreateOutputMemoryRegions());
RETURN_IF_ERROR(CreateAndPopulateInputMemoryRegions());
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateOutputMemoryRegions()
{
// Allocate the shared memory for outputs
for (const auto& output : *(parser_->Outputs())) {
const std::string& name = output.first;
const ModelTensor& tensor = output.second;
int64_t batch1_bytesize = ByteSize(tensor.shape_, tensor.datatype_);
if (batch1_bytesize < 0) {
batch1_bytesize = output_shm_size_;
}
uint8_t* output_shm_ptr;
size_t alloc_size = batch1_bytesize * batch_size_;
std::string region_name(TensorToRegionName(name));
RETURN_IF_ERROR(CreateMemoryRegion(
region_name, shared_memory_type_, alloc_size,
reinterpret_cast<void**>(&output_shm_ptr)));
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateAndPopulateInputMemoryRegions()
{
// All combinations of input + stream + step
//
for (const auto& input : *(parser_->Inputs())) {
const std::string& name = input.first;
const ModelTensor& tensor = input.second;
for (int stream_id = 0;
stream_id < (int)data_loader_->GetDataStreamsCount(); stream_id++) {
for (int step_id = 0;
step_id < (int)data_loader_->GetTotalSteps(stream_id);
step_id += 1) {
RETURN_IF_ERROR(CreateAndPopulateInputMemoryRegion(
name, tensor, stream_id, step_id));
}
}
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateAndPopulateInputMemoryRegion(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id)
{
std::vector<TensorData> input_datas;
RETURN_IF_ERROR(GetInputData(name, tensor, stream_id, step_id, input_datas));
if (tensor.is_shape_tensor_) {
RETURN_IF_ERROR(
ValidateShapeTensor(tensor, stream_id, step_id, input_datas));
}
size_t alloc_size = 0;
for (size_t i = 0; i < input_datas.size(); i++) {
if (!input_datas[i].is_valid) {
return cb::Error(
"Shared memory mode in Perf Analyzer does not support optional inputs "
"at this time",
pa::GENERIC_ERROR);
}
alloc_size += input_datas[i].batch1_size;
}
// Generate the shared memory region name
std::string region_name(
TensorToRegionName(name) + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
uint8_t* input_shm_ptr;
RETURN_IF_ERROR(CreateMemoryRegion(
region_name, shared_memory_type_, alloc_size,
reinterpret_cast<void**>(&input_shm_ptr)));
RETURN_IF_ERROR(CopySharedMemory(
input_shm_ptr, input_datas, tensor.is_shape_tensor_, region_name));
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CreateMemoryRegion(
const std::string& shm_region_name, const SharedMemoryType& memory_type,
const size_t byte_size, void** ptr)
{
if (memory_type == SharedMemoryType::SYSTEM_SHARED_MEMORY) {
if (factory_->Kind() ==
triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) {
*ptr = new uint8_t[byte_size];
RETURN_IF_ERROR(
backend_->RegisterSystemMemory(shm_region_name, *ptr, byte_size));
// Use delete[] as the deleter to match the new[] allocation above.
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr),
[](uint8_t* memory) { delete[] memory; }))));
} else {
std::string shm_key("/" + shm_region_name);
int shm_fd_op;
RETURN_IF_ERROR(
backend_->CreateSharedMemoryRegion(shm_key, byte_size, &shm_fd_op));
RETURN_IF_ERROR(backend_->MapSharedMemory(shm_fd_op, 0, byte_size, ptr));
RETURN_IF_ERROR(backend_->RegisterSystemSharedMemory(
shm_region_name, shm_key, byte_size));
// No-op destruction
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr), [](uint8_t* memory) {}))));
}
} else if (memory_type == SharedMemoryType::CUDA_SHARED_MEMORY) {
#ifdef TRITON_ENABLE_GPU
cudaError_t cuda_err = cudaMalloc((void**)ptr, byte_size);
if (cuda_err != cudaSuccess) {
return cb::Error(
"unable to allocate memory of " + std::to_string(byte_size) +
" bytes on gpu for output: " +
std::string(cudaGetErrorString(cuda_err)),
pa::GENERIC_ERROR);
}
if (factory_->Kind() ==
triton::perfanalyzer::clientbackend::BackendKind::TRITON_C_API) {
RETURN_IF_ERROR(
backend_->RegisterCudaMemory(shm_region_name, *ptr, byte_size));
// Set cudaFree as the destructor
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr),
[shm_region_name, byte_size](uint8_t* memory) {
cudaError_t cuda_err = cudaFree(memory);
if (cuda_err != cudaSuccess) {
std::cerr
<< "Unable to free cuda shared memory for "
<< shm_region_name
<< ": Starting: " << static_cast<void*>(memory)
<< ", size: " << byte_size
<< " bytes, Details: " << cudaGetErrorString(cuda_err)
<< std::endl;
}
}))));
} else {
cudaIpcMemHandle_t cuda_handle;
RETURN_IF_ERROR(
CreateCUDAIPCHandle(&cuda_handle, reinterpret_cast<void*>(*ptr)));
RETURN_IF_ERROR(backend_->RegisterCudaSharedMemory(
shm_region_name, cuda_handle, byte_size));
// No operation required for deleting the memory
shared_memory_regions_.emplace(
std::piecewise_construct, std::forward_as_tuple(shm_region_name),
std::forward_as_tuple(SharedMemoryData(
byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>>(
reinterpret_cast<uint8_t*>(*ptr), [](uint8_t* memory) {}))));
}
#endif // TRITON_ENABLE_GPU
} else {
return cb::Error(
"CreateMemoryRegion called with invalid memory region type.",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::CopySharedMemory(
uint8_t* input_shm_ptr, const std::vector<TensorData>& tensor_datas,
bool is_shape_tensor, std::string& region_name)
{
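// Batch elements are packed back to back: element i is written at the
// running byte offset of all previous batch-1 sizes. Shape tensors copy only
// the first element, since every batch element must be identical.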
if (shared_memory_type_ == SharedMemoryType::SYSTEM_SHARED_MEMORY) {
// Populate the region with data
size_t count = 0;
size_t offset = 0;
size_t max_count = is_shape_tensor ? 1 : batch_size_;
while (count < max_count) {
memcpy(
input_shm_ptr + offset, tensor_datas[count].data_ptr,
tensor_datas[count].batch1_size);
offset += tensor_datas[count].batch1_size;
count++;
}
} else {
#ifdef TRITON_ENABLE_GPU
// Populate the region with data
size_t count = 0;
size_t offset = 0;
size_t max_count = is_shape_tensor ? 1 : batch_size_;
while (count < max_count) {
cudaError_t cuda_err = cudaMemcpy(
(void*)(input_shm_ptr + offset), (void*)tensor_datas[count].data_ptr,
tensor_datas[count].batch1_size, cudaMemcpyHostToDevice);
if (cuda_err != cudaSuccess) {
return cb::Error(
"Failed to copy data to cuda shared memory for " + region_name +
" : " + std::string(cudaGetErrorString(cuda_err)),
pa::GENERIC_ERROR);
}
offset += tensor_datas[count].batch1_size;
count++;
}
#endif // TRITON_ENABLE_GPU
}
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data)
{
std::vector<int64_t> shape;
RETURN_IF_ERROR(data_loader_->GetInputShape(model_tensor, 0, 0, &shape));
if (!shape.empty()) {
if ((parser_->MaxBatchSize() != 0) && (!model_tensor.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
} else {
return cb::Error("unable to set shape for the input", pa::GENERIC_ERROR);
}
cb::InferInput* infer_input;
RETURN_IF_ERROR(CreateInferInput(
&infer_input, backend_kind_, name, shape, model_tensor.datatype_));
infer_data.inputs_.push_back(infer_input);
// FIXME: TMA-765 - Shared memory mode does not currently support optional
// inputs; support will be implemented in the associated story.
infer_data.valid_inputs_.push_back(infer_input);
std::string region_name(
TensorToRegionName(name) + "_" + std::to_string(0) + "_" +
std::to_string(0));
RETURN_IF_ERROR(infer_input->SetSharedMemory(
region_name, shared_memory_regions_[region_name].byte_size_));
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::InitInferDataOutput(
const std::string& name, InferData& infer_data)
{
cb::InferRequestedOutput* requested_output;
RETURN_IF_ERROR(
cb::InferRequestedOutput::Create(&requested_output, backend_kind_, name));
infer_data.outputs_.push_back(requested_output);
std::string region_name(TensorToRegionName(name));
RETURN_IF_ERROR(requested_output->SetSharedMemory(
region_name, shared_memory_regions_[region_name].byte_size_));
return cb::Error::Success;
}
cb::Error
InferDataManagerShm::UpdateInputs(
const size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data)
{
for (const auto& input : infer_data.inputs_) {
RETURN_IF_ERROR(input->Reset());
const auto& model_input = (*(parser_->Inputs()))[input->Name()];
std::string region_name(
TensorToRegionName(input->Name()) + '_' + std::to_string(stream_index) +
"_" + std::to_string(step_index));
std::vector<int64_t> shape;
RETURN_IF_ERROR(data_loader_->GetInputShape(
model_input, stream_index, step_index, &shape));
if (!shape.empty()) {
if ((parser_->MaxBatchSize() != 0) && (!model_input.is_shape_tensor_)) {
shape.insert(shape.begin(), (int64_t)batch_size_);
}
input->SetShape(shape);
}
RETURN_IF_ERROR(input->SetSharedMemory(
region_name, shared_memory_regions_[region_name].byte_size_));
}
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif  // TRITON_ENABLE_GPU
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "infer_data_manager_base.h"
#include "model_parser.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
namespace {
#ifdef TRITON_ENABLE_GPU
#define RETURN_IF_CUDA_ERR(FUNC) \
{ \
const cudaError_t result = FUNC; \
if (result != cudaSuccess) { \
return cb::Error( \
"CUDA exception (line " + std::to_string(__LINE__) + \
"): " + cudaGetErrorName(result) + " (" + \
cudaGetErrorString(result) + ")", \
pa::GENERIC_ERROR); \
} \
}
cb::Error
CreateCUDAIPCHandle(
cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0)
{
// Set the GPU device to the desired GPU
RETURN_IF_CUDA_ERR(cudaSetDevice(device_id));
// Create IPC handle for data on the gpu
RETURN_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr));
return cb::Error::Success;
}
#endif // TRITON_ENABLE_GPU
} // namespace
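// Illustrative sketch (not part of the original sources): RETURN_IF_CUDA_ERR
// wraps a CUDA runtime call and converts any failure into a cb::Error that
// records the line number, error name, and error string. The helper below is
// hypothetical and exists only to show the macro in use.
#ifdef TRITON_ENABLE_GPU
inline cb::Error
ExampleAllocateZeroedDeviceBuffer(void** ptr, size_t byte_size)
{
  // Each call returns early with a descriptive cb::Error on failure.
  RETURN_IF_CUDA_ERR(cudaMalloc(ptr, byte_size));
  RETURN_IF_CUDA_ERR(cudaMemset(*ptr, 0, byte_size));
  return cb::Error::Success;
}
#endif  // TRITON_ENABLE_GPU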
/// Holds information about the shared memory locations
struct SharedMemoryData {
SharedMemoryData(
size_t byte_size,
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>> data)
: byte_size_(byte_size), data_(std::move(data))
{
}
SharedMemoryData() {}
// Byte size
size_t byte_size_;
// Unique pointer holding the shared memory data
std::unique_ptr<uint8_t, std::function<void(uint8_t*)>> data_;
};
/// Manages the infer data needed to prepare an inference request and to hold
/// the resulting inference output from the Triton server, passing input and
/// output data through shared memory regions
class InferDataManagerShm : public InferDataManagerBase {
public:
InferDataManagerShm(
const int32_t batch_size, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
const std::shared_ptr<DataLoader>& data_loader)
: InferDataManagerBase(batch_size, parser, factory, data_loader),
shared_memory_type_(shared_memory_type),
output_shm_size_(output_shm_size)
{
}
~InferDataManagerShm();
/// Initialize this object. Must be called before any other functions
/// \return cb::Error object indicating success or failure.
cb::Error Init() override;
protected:
cb::Error CreateOutputMemoryRegions();
cb::Error CreateAndPopulateInputMemoryRegions();
cb::Error CreateAndPopulateInputMemoryRegion(
const std::string& name, const ModelTensor& tensor, int stream_id,
int step_id);
/// Create a memory region.
/// \return cb::Error object indicating success or failure.
cb::Error CreateMemoryRegion(
const std::string& shm_region_name, const SharedMemoryType& memory_type,
const size_t byte_size, void** ptr);
/// \brief Helper function to copy the input data into the correct shared
/// memory region
/// \param input_shm_ptr Pointer to the shared memory for a specific input
/// \param input_datas The TensorDatas to be copied
/// \param is_shape_tensor Is the input a shape tensor
/// \param region_name Name of the shared memory region
/// \return cb::Error object indicating success or failure
virtual cb::Error CopySharedMemory(
uint8_t* input_shm_ptr, const std::vector<TensorData>& input_datas,
bool is_shape_tensor, std::string& region_name);
cb::Error InitInferDataInput(
const std::string& name, const ModelTensor& model_tensor,
InferData& infer_data) override;
cb::Error InitInferDataOutput(
const std::string& name, InferData& infer_data) override;
/// Helper function to update the inputs
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
virtual cb::Error UpdateInputs(
size_t thread_id, const int stream_index, const int step_index,
InferData& infer_data) override;
SharedMemoryType shared_memory_type_;
size_t output_shm_size_;
// Map from shared memory key to its starting address and size
std::unordered_map<std::string, SharedMemoryData> shared_memory_regions_;
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "inference_profiler.h"
#include <math.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <limits>
#include <queue>
#include <sstream>
#include <stdexcept>
#include "client_backend/client_backend.h"
#include "constants.h"
#include "doctest.h"
namespace triton { namespace perfanalyzer {
cb::Error
ReportPrometheusMetrics(const Metrics& metrics)
{
const size_t max_num_gpus_in_stdout{16};
if (metrics.gpu_utilization_per_gpu.size() > max_num_gpus_in_stdout ||
metrics.gpu_power_usage_per_gpu.size() > max_num_gpus_in_stdout ||
metrics.gpu_memory_used_bytes_per_gpu.size() > max_num_gpus_in_stdout ||
metrics.gpu_memory_total_bytes_per_gpu.size() > max_num_gpus_in_stdout) {
std::cout << "Too many GPUs on system to print out individual Prometheus "
"metrics, use the CSV output feature to see metrics."
<< std::endl;
return cb::Error::Success;
}
std::cout << " Avg GPU Utilization:" << std::endl;
for (const auto& gpu_uuid_metric_pair : metrics.gpu_utilization_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << (metric * 100.0) << "%"
<< std::endl;
}
std::cout << " Avg GPU Power Usage:" << std::endl;
for (const auto& gpu_uuid_metric_pair : metrics.gpu_power_usage_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << metric << " watts"
<< std::endl;
}
std::cout << " Max GPU Memory Usage:" << std::endl;
for (const auto& gpu_uuid_metric_pair :
metrics.gpu_memory_used_bytes_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << metric << " bytes"
<< std::endl;
}
std::cout << " Total GPU Memory:" << std::endl;
for (const auto& gpu_uuid_metric_pair :
metrics.gpu_memory_total_bytes_per_gpu) {
const auto gpu_uuid{gpu_uuid_metric_pair.first};
const auto metric{gpu_uuid_metric_pair.second};
std::cout << " " << gpu_uuid << " : " << metric << " bytes"
<< std::endl;
}
return cb::Error::Success;
}
namespace {
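// Converts a summed duration in nanoseconds into an average in microseconds.
// Example: 3,000,000 ns over 3 requests -> 3,000,000 / (3 * 1000) = 1000
// usec. Integer division truncates any sub-microsecond remainder.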
inline uint64_t
AverageDurationInUs(const uint64_t total_time_in_ns, const uint64_t cnt)
{
if (cnt == 0) {
return 0;
}
return total_time_in_ns / (cnt * 1000);
}
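// Recursively walks the composing models of an ensemble and accumulates
// per-model average queue, compute, and cache durations. Leaf models (those
// with no composing models of their own) contribute directly; nested
// ensembles are folded in through recursion.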
EnsembleDurations
GetTotalEnsembleDurations(const ServerSideStats& stats)
{
EnsembleDurations result;
for (const auto& model_stats : stats.composing_models_stat) {
if (model_stats.second.composing_models_stat.empty()) {
// Cache hit count covers cache hits, not related to compute times
const uint64_t cache_hit_cnt = model_stats.second.cache_hit_count;
// cache_miss_cnt should either equal infer_cnt or be zero if
// cache is disabled or not supported for the model/scheduler type
const uint64_t cache_miss_cnt = model_stats.second.cache_miss_count;
result.total_queue_time_avg_us += AverageDurationInUs(
model_stats.second.queue_time_ns, model_stats.second.queue_count);
const uint64_t compute_time = model_stats.second.compute_input_time_ns +
model_stats.second.compute_infer_time_ns +
model_stats.second.compute_output_time_ns;
if (model_stats.second.compute_input_count !=
model_stats.second.compute_infer_count ||
model_stats.second.compute_infer_count !=
model_stats.second.compute_output_count) {
throw std::runtime_error(
"Server side statistics compute counts must be the same.");
}
const uint64_t compute_cnt = model_stats.second.compute_input_count;
result.total_compute_time_avg_us +=
AverageDurationInUs(compute_time, compute_cnt);
result.total_cache_hit_time_avg_us += AverageDurationInUs(
model_stats.second.cache_hit_time_ns, cache_hit_cnt);
result.total_cache_miss_time_avg_us += AverageDurationInUs(
model_stats.second.cache_miss_time_ns, cache_miss_cnt);
// Track combined cache/compute total avg for reporting latency with cache
// enabled
result.total_combined_cache_compute_time_avg_us += AverageDurationInUs(
compute_time + model_stats.second.cache_hit_time_ns +
model_stats.second.cache_miss_time_ns,
compute_cnt + cache_hit_cnt);
} else {
const auto this_ensemble_duration =
GetTotalEnsembleDurations(model_stats.second);
result.total_queue_time_avg_us +=
this_ensemble_duration.total_queue_time_avg_us;
result.total_compute_time_avg_us +=
this_ensemble_duration.total_compute_time_avg_us;
result.total_cache_hit_time_avg_us +=
this_ensemble_duration.total_cache_hit_time_avg_us;
result.total_cache_miss_time_avg_us +=
this_ensemble_duration.total_cache_miss_time_avg_us;
result.total_combined_cache_compute_time_avg_us +=
this_ensemble_duration.total_combined_cache_compute_time_avg_us;
}
}
return result;
}
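// Overhead is the part of the total latency not attributed to queueing or
// compute. Example: total 1000 usec, queue 200 usec, compute 600 usec ->
// overhead 200 usec. Clamped to 0 when queue + compute exceeds the total,
// which can happen because each component is an independently rounded
// average.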
size_t
GetOverheadDuration(size_t total_time, size_t queue_time, size_t compute_time)
{
return (total_time > queue_time + compute_time)
? (total_time - queue_time - compute_time)
: 0;
}
cb::Error
ReportServerSideStats(
const ServerSideStats& stats, const int iteration,
const std::shared_ptr<ModelParser>& parser)
{
const std::string ident = std::string(2 * iteration, ' ');
// Infer/exec counts cover compute time done in inference backends,
// not related to cache hit times
const uint64_t exec_cnt = stats.execution_count;
const uint64_t infer_cnt = stats.inference_count;
// Cache hit count covers cache hits, not related to compute times
const uint64_t cache_hit_cnt = stats.cache_hit_count;
const uint64_t cache_miss_cnt = stats.cache_miss_count;
// Success count covers all successful requests; the cumulative time, queue
// time, compute time, and cache times are aggregated over these requests.
const uint64_t cnt = stats.success_count;
if (cnt == 0) {
std::cout << ident << " Request count: " << cnt << std::endl;
return cb::Error::Success;
}
const uint64_t cumm_avg_us = AverageDurationInUs(stats.cumm_time_ns, cnt);
std::cout << ident << " Inference count: " << infer_cnt << std::endl
<< ident << " Execution count: " << exec_cnt << std::endl;
if (parser->ResponseCacheEnabled()) {
std::cout << ident << " Cache hit count: " << cache_hit_cnt << std::endl;
std::cout << ident << " Cache miss count: " << cache_miss_cnt << std::endl;
}
std::cout << ident << " Successful request count: " << cnt << std::endl
<< ident << " Avg request latency: " << cumm_avg_us << " usec";
// Non-ensemble model
if (stats.composing_models_stat.empty()) {
const uint64_t queue_avg_us =
AverageDurationInUs(stats.queue_time_ns, stats.queue_count);
const uint64_t compute_input_avg_us = AverageDurationInUs(
stats.compute_input_time_ns, stats.compute_input_count);
const uint64_t compute_infer_avg_us = AverageDurationInUs(
stats.compute_infer_time_ns, stats.compute_infer_count);
const uint64_t compute_output_avg_us = AverageDurationInUs(
stats.compute_output_time_ns, stats.compute_output_count);
const uint64_t compute_time = stats.compute_input_time_ns +
stats.compute_infer_time_ns +
stats.compute_output_time_ns;
if (stats.compute_input_count != stats.compute_infer_count ||
stats.compute_infer_count != stats.compute_output_count) {
throw std::runtime_error(
"Server side statistics compute counts must be the same.");
}
const uint64_t compute_cnt = stats.compute_input_count;
const uint64_t compute_avg_us =
AverageDurationInUs(compute_time, compute_cnt);
const uint64_t cache_hit_avg_us =
AverageDurationInUs(stats.cache_hit_time_ns, cache_hit_cnt);
const uint64_t cache_miss_avg_us =
AverageDurationInUs(stats.cache_miss_time_ns, cache_miss_cnt);
const uint64_t total_compute_time_ns = stats.compute_input_time_ns +
stats.compute_infer_time_ns +
stats.compute_output_time_ns;
// Get the average of cache hits and misses across successful requests
const uint64_t combined_cache_compute_avg_us = AverageDurationInUs(
stats.cache_hit_time_ns + stats.cache_miss_time_ns +
total_compute_time_ns,
compute_cnt + cache_hit_cnt);
if (parser->ResponseCacheEnabled()) {
const uint64_t overhead_avg_us = GetOverheadDuration(
cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << queue_avg_us << " usec + "
<< "cache hit/miss " << combined_cache_compute_avg_us
<< " usec)" << std::endl;
std::cout << ident << ident
<< " Average Cache Hit Latency: " << cache_hit_avg_us
<< " usec" << std::endl;
std::cout << ident << ident << " Average Cache Miss Latency: "
<< cache_miss_avg_us + compute_avg_us << " usec "
<< "(cache lookup/insertion " << cache_miss_avg_us << " usec + "
<< "compute input " << compute_input_avg_us << " usec + "
<< "compute infer " << compute_infer_avg_us << " usec + "
<< "compute output " << compute_output_avg_us << " usec)"
<< std::endl
<< std::endl;
}
// Response Cache Disabled
else {
std::cout << " (overhead "
<< GetOverheadDuration(
cumm_avg_us, queue_avg_us, compute_avg_us)
<< " usec + "
<< "queue " << queue_avg_us << " usec + "
<< "compute input " << compute_input_avg_us << " usec + "
<< "compute infer " << compute_infer_avg_us << " usec + "
<< "compute output " << compute_output_avg_us << " usec)"
<< std::endl
<< std::endl;
if (cache_hit_avg_us > 0 || cache_miss_avg_us > 0) {
std::cerr << "Response Cache is disabled for model ["
<< parser->ModelName()
<< "] but cache hit/miss latency is non-zero." << std::endl;
}
}
}
// Ensemble Model
else {
const auto ensemble_times = GetTotalEnsembleDurations(stats);
// Response Cache Enabled
if (parser->ResponseCacheEnabled()) {
const uint64_t overhead_avg_us = GetOverheadDuration(
cumm_avg_us, ensemble_times.total_queue_time_avg_us,
ensemble_times.total_combined_cache_compute_time_avg_us);
std::cout << " (overhead " << overhead_avg_us << " usec + "
<< "queue " << ensemble_times.total_queue_time_avg_us
<< " usec + "
<< "cache hit/miss "
<< ensemble_times.total_combined_cache_compute_time_avg_us
<< " usec)" << std::endl;
std::cout << ident << ident << " Average Cache Hit Latency: "
<< ensemble_times.total_cache_hit_time_avg_us << " usec"
<< std::endl;
std::cout << ident << ident << " Average Cache Miss Latency: "
<< ensemble_times.total_cache_miss_time_avg_us +
ensemble_times.total_compute_time_avg_us
<< " usec " << std::endl
<< std::endl;
}
// Response Cache Disabled
else {
std::cout << " (overhead "
<< GetOverheadDuration(
cumm_avg_us, ensemble_times.total_queue_time_avg_us,
ensemble_times.total_compute_time_avg_us)
<< " usec + "
<< "queue " << ensemble_times.total_queue_time_avg_us
<< " usec + "
<< "compute " << ensemble_times.total_compute_time_avg_us
<< " usec)" << std::endl
<< std::endl;
}
// List out composing models of ensemble model
std::cout << ident << "Composing models: " << std::endl;
for (const auto& model_stats : stats.composing_models_stat) {
const auto& model_identifier = model_stats.first;
std::cout << ident << model_identifier.first
<< ", version: " << model_identifier.second << std::endl;
ReportServerSideStats(model_stats.second, iteration + 1, parser);
}
}
return cb::Error::Success;
}
cb::Error
ReportClientSideStats(
const ClientSideStats& stats, const int64_t percentile,
const cb::ProtocolType protocol, const bool verbose,
const bool on_sequence_model, const bool include_lib_stats,
const double overhead_pct, const double send_request_rate,
const bool is_decoupled_model)
{
const uint64_t avg_latency_us = stats.avg_latency_ns / 1000;
const uint64_t std_us = stats.std_us;
const uint64_t avg_request_time_us = stats.avg_request_time_ns / 1000;
const uint64_t avg_send_time_us = stats.avg_send_time_ns / 1000;
const uint64_t avg_receive_time_us = stats.avg_receive_time_ns / 1000;
const uint64_t avg_response_wait_time_us =
avg_request_time_us - avg_send_time_us - avg_receive_time_us;
std::string client_library_detail = " ";
if (include_lib_stats) {
if (protocol == cb::ProtocolType::GRPC) {
client_library_detail +=
"Avg gRPC time: " + std::to_string(avg_request_time_us) + " usec (";
if (!verbose) {
client_library_detail +=
"(un)marshal request/response " +
std::to_string(avg_send_time_us + avg_receive_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) + " usec)";
} else {
client_library_detail += "marshal " + std::to_string(avg_send_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) +
" usec + unmarshal " +
std::to_string(avg_receive_time_us) + " usec)";
}
} else if (protocol == cb::ProtocolType::HTTP) {
client_library_detail +=
"Avg HTTP time: " + std::to_string(avg_request_time_us) + " usec (";
if (!verbose) {
client_library_detail +=
"send/recv " +
std::to_string(avg_send_time_us + avg_receive_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) + " usec)";
} else {
client_library_detail += "send " + std::to_string(avg_send_time_us) +
" usec + response wait " +
std::to_string(avg_response_wait_time_us) +
" usec + receive " +
std::to_string(avg_receive_time_us) + " usec)";
}
}
}
std::cout << " Request count: " << stats.request_count << std::endl;
double delay_pct =
((double)stats.delayed_request_count / stats.request_count) * 100;
if (delay_pct > DELAY_PCT_THRESHOLD) {
std::cout << " "
<< "Avg send request rate: " << std::fixed << std::setprecision(2)
<< send_request_rate << " infer/sec" << std::endl;
std::cout << " "
<< "[WARNING] Perf Analyzer was not able to keep up with the "
"desired request rate. ";
std::cout << delay_pct << "% of the requests were delayed. " << std::endl;
}
if (on_sequence_model) {
std::cout << " Sequence count: " << stats.sequence_count << " ("
<< stats.sequence_per_sec << " seq/sec)" << std::endl;
}
std::cout << " Throughput: " << stats.infer_per_sec << " infer/sec"
<< std::endl;
if (is_decoupled_model) {
std::cout << " Response Throughput: " << stats.responses_per_sec
<< " infer/sec" << std::endl;
}
if (verbose) {
std::stringstream client_overhead{""};
client_overhead << " "
<< "Avg client overhead: " << std::fixed
<< std::setprecision(2) << overhead_pct << "%";
std::cout << client_overhead.str() << std::endl;
}
if (percentile == -1) {
std::cout << " Avg latency: " << avg_latency_us << " usec"
<< " (standard deviation " << std_us << " usec)" << std::endl;
}
for (const auto& percentile : stats.percentile_latency_ns) {
std::cout << " p" << percentile.first
<< " latency: " << (percentile.second / 1000) << " usec"
<< std::endl;
}
std::cout << client_library_detail << std::endl;
return cb::Error::Success;
}
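// Prints the combined report for a measurement: client-side statistics,
// optionally server-side statistics and Prometheus metrics, and a warning if
// the Perf Analyzer overhead exceeds the configured threshold.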
cb::Error
Report(
const PerfStatus& summary, const int64_t percentile,
const cb::ProtocolType protocol, const bool verbose,
const bool include_lib_stats, const bool include_server_stats,
const std::shared_ptr<ModelParser>& parser,
const bool should_collect_metrics, const double overhead_pct_threshold)
{
std::cout << " Client: " << std::endl;
ReportClientSideStats(
summary.client_stats, percentile, protocol, verbose,
summary.on_sequence_model, include_lib_stats, summary.overhead_pct,
summary.send_request_rate, parser->IsDecoupled());
if (include_server_stats) {
std::cout << " Server: " << std::endl;
ReportServerSideStats(summary.server_stats, 1, parser);
}
if (should_collect_metrics) {
std::cout << " Server Prometheus Metrics: " << std::endl;
ReportPrometheusMetrics(summary.metrics.front());
}
if (summary.overhead_pct > overhead_pct_threshold) {
std::cout << "[WARNING] Perf Analyzer is not able to keep up with the "
"desired load. The results may not be accurate."
<< std::endl;
}
return cb::Error::Success;
}
} // namespace
cb::Error
InferenceProfiler::Create(
const bool verbose, const double stability_threshold,
const uint64_t measurement_window_ms, const size_t max_trials,
const int64_t percentile, const uint64_t latency_threshold_ms_,
const cb::ProtocolType protocol, std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager,
std::unique_ptr<InferenceProfiler>* profiler,
uint64_t measurement_request_count, MeasurementMode measurement_mode,
std::shared_ptr<MPIDriver> mpi_driver, const uint64_t metrics_interval_ms,
const bool should_collect_metrics, const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data)
{
std::unique_ptr<InferenceProfiler> local_profiler(new InferenceProfiler(
verbose, stability_threshold, measurement_window_ms, max_trials,
(percentile != -1), percentile, latency_threshold_ms_, protocol, parser,
profile_backend, std::move(manager), measurement_request_count,
measurement_mode, mpi_driver, metrics_interval_ms, should_collect_metrics,
overhead_pct_threshold, collector, should_collect_profile_data));
*profiler = std::move(local_profiler);
return cb::Error::Success;
}
InferenceProfiler::InferenceProfiler(
const bool verbose, const double stability_threshold,
const int32_t measurement_window_ms, const size_t max_trials,
const bool extra_percentile, const size_t percentile,
const uint64_t latency_threshold_ms_, const cb::ProtocolType protocol,
std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager, uint64_t measurement_request_count,
MeasurementMode measurement_mode, std::shared_ptr<MPIDriver> mpi_driver,
const uint64_t metrics_interval_ms, const bool should_collect_metrics,
const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data)
: verbose_(verbose), measurement_window_ms_(measurement_window_ms),
max_trials_(max_trials), extra_percentile_(extra_percentile),
percentile_(percentile), latency_threshold_ms_(latency_threshold_ms_),
protocol_(protocol), parser_(parser), profile_backend_(profile_backend),
manager_(std::move(manager)),
measurement_request_count_(measurement_request_count),
measurement_mode_(measurement_mode), mpi_driver_(mpi_driver),
should_collect_metrics_(should_collect_metrics),
overhead_pct_threshold_(overhead_pct_threshold), collector_(collector),
should_collect_profile_data_(should_collect_profile_data)
{
load_parameters_.stability_threshold = stability_threshold;
load_parameters_.stability_window = 3;
if (profile_backend_->Kind() == cb::BackendKind::TRITON ||
profile_backend_->Kind() == cb::BackendKind::TRITON_C_API) {
// Measure and report client library stats only when the model
// is not decoupled.
include_lib_stats_ = (!parser_->IsDecoupled());
// Measure and report server statistics only when the server
// supports the statistics extension.
std::set<std::string> extensions;
profile_backend_->ServerExtensions(&extensions);
include_server_stats_ = (extensions.find("statistics") != extensions.end());
} else {
include_lib_stats_ = true;
include_server_stats_ = false;
}
if (should_collect_metrics_) {
metrics_manager_ =
std::make_shared<MetricsManager>(profile_backend, metrics_interval_ms);
}
}
cb::Error
InferenceProfiler::Profile(
const size_t concurrent_request_count,
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable)
{
cb::Error err;
PerfStatus perf_status{};
perf_status.concurrency = concurrent_request_count;
is_stable = false;
meets_threshold = true;
RETURN_IF_ERROR(dynamic_cast<ConcurrencyManager*>(manager_.get())
->ChangeConcurrencyLevel(concurrent_request_count));
err = ProfileHelper(perf_status, &is_stable);
if (err.IsOk()) {
uint64_t stabilizing_latency_ms =
perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
(latency_threshold_ms_ != NO_LIMIT)) {
std::cerr << "Measured latency went over the set limit of "
<< latency_threshold_ms_ << " msec. " << std::endl;
meets_threshold = false;
} else if (!is_stable) {
if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) {
std::cerr << "Failed to obtain stable measurement within "
<< max_trials_ << " measurement windows for concurrency "
<< concurrent_request_count << ". Please try to "
<< "increase the --measurement-interval." << std::endl;
} else if (measurement_mode_ == MeasurementMode::COUNT_WINDOWS) {
std::cerr << "Failed to obtain stable measurement within "
<< max_trials_ << " measurement windows for concurrency "
<< concurrent_request_count << ". Please try to "
<< "increase the --measurement-request-count." << std::endl;
}
meets_threshold = false;
} else {
perf_statuses.push_back(perf_status);
err = Report(
perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
include_server_stats_, parser_, should_collect_metrics_,
overhead_pct_threshold_);
if (!err.IsOk()) {
std::cerr << err;
meets_threshold = false;
}
}
} else {
return err;
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::Profile(
const double request_rate, std::vector<PerfStatus>& perf_statuses,
bool& meets_threshold, bool& is_stable)
{
cb::Error err;
PerfStatus perf_status{};
perf_status.request_rate = request_rate;
is_stable = false;
meets_threshold = true;
RETURN_IF_ERROR(dynamic_cast<RequestRateManager*>(manager_.get())
->ChangeRequestRate(request_rate));
std::cout << "Request Rate: " << request_rate
<< " inference requests per seconds" << std::endl;
err = ProfileHelper(perf_status, &is_stable);
if (err.IsOk()) {
uint64_t stabilizing_latency_ms =
perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
(latency_threshold_ms_ != NO_LIMIT)) {
std::cerr << "Measured latency went over the set limit of "
<< latency_threshold_ms_ << " msec. " << std::endl;
meets_threshold = false;
} else if (!is_stable) {
std::cerr << "Failed to obtain stable measurement." << std::endl;
meets_threshold = false;
} else {
perf_statuses.push_back(perf_status);
err = Report(
perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
include_server_stats_, parser_, should_collect_metrics_,
overhead_pct_threshold_);
if (!err.IsOk()) {
std::cerr << err;
meets_threshold = false;
}
}
} else {
return err;
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::Profile(
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable)
{
cb::Error err;
PerfStatus perf_status{};
RETURN_IF_ERROR(
dynamic_cast<CustomLoadManager*>(manager_.get())->InitCustomIntervals());
RETURN_IF_ERROR(dynamic_cast<CustomLoadManager*>(manager_.get())
->GetCustomRequestRate(&perf_status.request_rate));
is_stable = false;
meets_threshold = true;
err = ProfileHelper(perf_status, &is_stable);
if (err.IsOk()) {
uint64_t stabilizing_latency_ms =
perf_status.stabilizing_latency_ns / NANOS_PER_MILLIS;
if ((stabilizing_latency_ms >= latency_threshold_ms_) &&
(latency_threshold_ms_ != NO_LIMIT)) {
std::cerr << "Measured latency went over the set limit of "
<< latency_threshold_ms_ << " msec. " << std::endl;
meets_threshold = false;
} else if (!is_stable) {
std::cerr << "Failed to obtain stable measurement." << std::endl;
meets_threshold = false;
} else {
perf_statuses.push_back(perf_status);
err = Report(
perf_status, percentile_, protocol_, verbose_, include_lib_stats_,
include_server_stats_, parser_, should_collect_metrics_,
overhead_pct_threshold_);
if (!err.IsOk()) {
std::cerr << err;
meets_threshold = false;
}
}
} else {
return err;
}
return cb::Error::Success;
}
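// Repeatedly runs measurement windows until the most recent
// 'stability_window' measurements are stable, the latency threshold is
// exceeded, or max_trials is reached. Stable windows are then merged into a
// single experiment-level report.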
cb::Error
InferenceProfiler::ProfileHelper(
PerfStatus& experiment_perf_status, bool* is_stable)
{
// Start measurement
LoadStatus load_status;
size_t completed_trials = 0;
std::queue<cb::Error> error;
std::deque<PerfStatus> measurement_perf_statuses;
all_request_records_.clear();
previous_window_end_ns_ = 0;
// Start with a fresh empty request records vector in the manager
//
std::vector<RequestRecord> empty_request_records;
RETURN_IF_ERROR(manager_->SwapRequestRecords(empty_request_records));
do {
PerfStatus measurement_perf_status;
measurement_perf_status.concurrency = experiment_perf_status.concurrency;
measurement_perf_status.request_rate = experiment_perf_status.request_rate;
RETURN_IF_ERROR(manager_->CheckHealth());
if (measurement_mode_ == MeasurementMode::TIME_WINDOWS) {
error.push(
Measure(measurement_perf_status, measurement_window_ms_, false));
} else {
error.push(
Measure(measurement_perf_status, measurement_request_count_, true));
}
measurement_perf_statuses.push_back(measurement_perf_status);
if (error.size() > load_parameters_.stability_window) {
error.pop();
measurement_perf_statuses.pop_front();
}
if (error.back().IsOk()) {
load_status.infer_per_sec.push_back(
measurement_perf_status.client_stats.infer_per_sec);
load_status.latencies.push_back(
measurement_perf_status.stabilizing_latency_ns);
} else {
load_status.infer_per_sec.push_back(0);
load_status.latencies.push_back(std::numeric_limits<uint64_t>::max());
}
load_status.avg_ips +=
load_status.infer_per_sec.back() / load_parameters_.stability_window;
load_status.avg_latency +=
load_status.latencies.back() / load_parameters_.stability_window;
if (verbose_) {
if (error.back().IsOk()) {
std::cout << " Pass [" << (completed_trials + 1)
<< "] throughput: " << load_status.infer_per_sec.back()
<< " infer/sec. ";
if (extra_percentile_) {
std::cout << "p" << percentile_ << " latency: "
<< (measurement_perf_status.client_stats
.percentile_latency_ns.find(percentile_)
->second /
1000)
<< " usec" << std::endl;
} else {
std::cout << "Avg latency: "
<< (measurement_perf_status.client_stats.avg_latency_ns /
1000)
<< " usec (std "
<< measurement_perf_status.client_stats.std_us << " usec). "
<< std::endl;
}
} else {
std::cout << " Pass [" << (completed_trials + 1)
<< "] cb::Error: " << error.back().Message() << std::endl;
}
}
*is_stable = DetermineStability(load_status);
if (IsDoneProfiling(load_status, is_stable)) {
break;
}
completed_trials++;
} while ((!early_exit) && (completed_trials < max_trials_));
if (should_collect_metrics_) {
metrics_manager_->StopQueryingMetrics();
}
// Return the first error, if any, that occurred within the stability window
// so that it can be handled properly.
while (!error.empty()) {
if (!error.front().IsOk()) {
return error.front();
} else {
error.pop();
}
}
// Only merge the results if the results have stabilized.
if (*is_stable) {
RETURN_IF_ERROR(MergePerfStatusReports(
measurement_perf_statuses, experiment_perf_status));
}
if (early_exit) {
return cb::Error("Received exit signal.", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
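// A load is considered stable once the last 'stability_window' measurements
// all have non-zero throughput and both throughput and latency stay within
// the configured stability threshold.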
bool
InferenceProfiler::DetermineStability(LoadStatus& load_status)
{
bool stable = false;
if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) {
stable = true;
size_t idx =
load_status.infer_per_sec.size() - load_parameters_.stability_window;
for (size_t i = idx; i < load_status.infer_per_sec.size(); i++) {
if (load_status.infer_per_sec[i] == 0) {
stable = false;
}
}
stable = stable && CheckWindowForStability(idx, load_status);
}
return stable;
}
bool
InferenceProfiler::CheckWindowForStability(size_t idx, LoadStatus& load_status)
{
return IsInferWindowStable(idx, load_status) &&
IsLatencyWindowStable(idx, load_status);
}
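// Throughput is stable when max/min over the stability window does not exceed
// 1 + stability_threshold. For example, with a threshold of 0.1, a window of
// [100, 105, 108] infer/sec is stable because 108 / 100 = 1.08 <= 1.1.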
bool
InferenceProfiler::IsInferWindowStable(size_t idx, LoadStatus& load_status)
{
auto infer_start = std::begin(load_status.infer_per_sec) + idx;
auto infer_per_sec_measurements = std::minmax_element(
infer_start, infer_start + load_parameters_.stability_window);
auto max_infer_per_sec = *infer_per_sec_measurements.second;
auto min_infer_per_sec = *infer_per_sec_measurements.first;
return max_infer_per_sec / min_infer_per_sec <=
1 + load_parameters_.stability_threshold;
}
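// The same max/min ratio check as above, applied to the stabilizing latencies
// recorded for the stability window.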
bool
InferenceProfiler::IsLatencyWindowStable(size_t idx, LoadStatus& load_status)
{
auto latency_start = std::begin(load_status.latencies) + idx;
auto latencies_per_sec_measurements = std::minmax_element(
latency_start, latency_start + load_parameters_.stability_window);
double max_latency = *latencies_per_sec_measurements.second;
double min_latency = *latencies_per_sec_measurements.first;
return max_latency / min_latency <= 1 + load_parameters_.stability_threshold;
}
bool
InferenceProfiler::IsDoneProfiling(LoadStatus& load_status, bool* is_stable)
{
bool done = false;
bool within_threshold = true;
if (load_status.infer_per_sec.size() >= load_parameters_.stability_window) {
size_t idx =
load_status.infer_per_sec.size() - load_parameters_.stability_window;
for (; idx < load_status.infer_per_sec.size(); idx++) {
within_threshold &= CheckWithinThreshold(idx, load_status);
}
}
if (mpi_driver_->IsMPIRun()) {
if (AllMPIRanksAreStable(*is_stable)) {
done = true;
}
} else if (*is_stable) {
done = true;
}
if ((!within_threshold) && (latency_threshold_ms_ != NO_LIMIT)) {
done = true;
}
return done;
}
bool
InferenceProfiler::CheckWithinThreshold(size_t idx, LoadStatus& load_status)
{
return load_status.latencies[idx] <
(latency_threshold_ms_ * NANOS_PER_MILLIS);
}
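// Merges the per-window server-side statistics into a single summary by
// summing counts and cumulative times, recursing into composing models for
// ensembles.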
cb::Error
InferenceProfiler::MergeServerSideStats(
std::vector<ServerSideStats>& server_side_stats,
ServerSideStats& server_side_summary)
{
auto& server_side_stat = server_side_stats[0];
// Make sure that the perf status reports profiling settings match with each
// other.
for (size_t i = 1; i < server_side_stats.size(); i++) {
if (server_side_stats[i].composing_models_stat.size() !=
server_side_stat.composing_models_stat.size()) {
return cb::Error(
"Inconsistent ensemble setting detected between the trials.",
pa::GENERIC_ERROR);
}
}
// Initialize the server stats for the merged report.
server_side_summary.inference_count = 0;
server_side_summary.execution_count = 0;
server_side_summary.cache_hit_count = 0;
server_side_summary.cache_miss_count = 0;
server_side_summary.success_count = 0;
server_side_summary.queue_count = 0;
server_side_summary.compute_input_count = 0;
server_side_summary.compute_output_count = 0;
server_side_summary.compute_infer_count = 0;
server_side_summary.cumm_time_ns = 0;
server_side_summary.queue_time_ns = 0;
server_side_summary.compute_input_time_ns = 0;
server_side_summary.compute_infer_time_ns = 0;
server_side_summary.compute_output_time_ns = 0;
server_side_summary.cache_hit_time_ns = 0;
server_side_summary.cache_miss_time_ns = 0;
server_side_summary.composing_models_stat.clear();
for (auto& composing_model_stat : server_side_stat.composing_models_stat) {
std::vector<ServerSideStats> composing_model_stats;
for (auto& server_side_stat : server_side_stats) {
composing_model_stats.push_back(
server_side_stat.composing_models_stat[composing_model_stat.first]);
}
ServerSideStats merged_composing_model_stats;
RETURN_IF_ERROR(MergeServerSideStats(
composing_model_stats, merged_composing_model_stats));
server_side_summary.composing_models_stat.insert(
{composing_model_stat.first, merged_composing_model_stats});
}
for (auto& server_side_stat : server_side_stats) {
// Aggregated Server Stats
server_side_summary.inference_count += server_side_stat.inference_count;
server_side_summary.execution_count += server_side_stat.execution_count;
server_side_summary.cache_hit_count += server_side_stat.cache_hit_count;
server_side_summary.cache_miss_count += server_side_stat.cache_miss_count;
server_side_summary.success_count += server_side_stat.success_count;
server_side_summary.queue_count += server_side_stat.queue_count;
server_side_summary.compute_input_count +=
server_side_stat.compute_input_count;
server_side_summary.compute_infer_count +=
server_side_stat.compute_infer_count;
server_side_summary.compute_output_count +=
server_side_stat.compute_output_count;
server_side_summary.cumm_time_ns += server_side_stat.cumm_time_ns;
server_side_summary.queue_time_ns += server_side_stat.queue_time_ns;
server_side_summary.compute_input_time_ns +=
server_side_stat.compute_input_time_ns;
server_side_summary.compute_infer_time_ns +=
server_side_stat.compute_infer_time_ns;
server_side_summary.compute_output_time_ns +=
server_side_stat.compute_output_time_ns;
server_side_summary.cache_hit_time_ns += server_side_stat.cache_hit_time_ns;
server_side_summary.cache_miss_time_ns +=
server_side_stat.cache_miss_time_ns;
}
return cb::Error::Success;
}
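// Merges the per-window PerfStatus reports of a stable run into a single
// experiment-level report: counts and latencies are accumulated, library
// timing averages are weighted by completed request count, and throughput
// rates are recomputed over the combined duration.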
cb::Error
InferenceProfiler::MergePerfStatusReports(
std::deque<PerfStatus>& perf_status_reports,
PerfStatus& experiment_perf_status)
{
auto& perf_status = perf_status_reports[0];
// Make sure that the perf status reports profiling settings match with each
// other.
for (size_t i = 1; i < perf_status_reports.size(); i++) {
perf_status.concurrency = experiment_perf_status.concurrency;
perf_status.request_rate = experiment_perf_status.request_rate;
if (perf_status_reports[i].on_sequence_model !=
perf_status.on_sequence_model) {
return cb::Error(
"Inconsistent sequence setting detected.", pa::GENERIC_ERROR);
}
if (perf_status_reports[i].batch_size != perf_status.batch_size) {
return cb::Error("Inconsistent batch size detected.", pa::GENERIC_ERROR);
}
if (perf_status_reports[i].server_stats.composing_models_stat.size() !=
perf_status.server_stats.composing_models_stat.size()) {
return cb::Error(
"Inconsistent ensemble setting detected between the trials.",
pa::GENERIC_ERROR);
}
}
experiment_perf_status.batch_size = perf_status.batch_size;
experiment_perf_status.on_sequence_model = perf_status.on_sequence_model;
// Initialize the client stats for the merged report.
experiment_perf_status.client_stats.request_count = 0;
experiment_perf_status.client_stats.sequence_count = 0;
experiment_perf_status.client_stats.delayed_request_count = 0;
experiment_perf_status.client_stats.response_count = 0;
experiment_perf_status.client_stats.duration_ns = 0;
experiment_perf_status.client_stats.avg_latency_ns = 0;
experiment_perf_status.client_stats.percentile_latency_ns.clear();
experiment_perf_status.client_stats.latencies.clear();
experiment_perf_status.client_stats.std_us = 0;
experiment_perf_status.client_stats.avg_request_time_ns = 0;
experiment_perf_status.client_stats.avg_send_time_ns = 0;
experiment_perf_status.client_stats.avg_receive_time_ns = 0;
experiment_perf_status.client_stats.infer_per_sec = 0;
experiment_perf_status.client_stats.sequence_per_sec = 0;
experiment_perf_status.client_stats.completed_count = 0;
experiment_perf_status.stabilizing_latency_ns = 0;
experiment_perf_status.overhead_pct = 0;
experiment_perf_status.send_request_rate = 0.0;
std::vector<ServerSideStats> server_side_stats;
for (auto& perf_status : perf_status_reports) {
// Aggregated Client Stats
experiment_perf_status.client_stats.request_count +=
perf_status.client_stats.request_count;
experiment_perf_status.client_stats.sequence_count +=
perf_status.client_stats.sequence_count;
experiment_perf_status.client_stats.delayed_request_count +=
perf_status.client_stats.delayed_request_count;
experiment_perf_status.client_stats.response_count +=
perf_status.client_stats.response_count;
experiment_perf_status.client_stats.duration_ns +=
perf_status.client_stats.duration_ns;
server_side_stats.push_back(perf_status.server_stats);
experiment_perf_status.client_stats.latencies.insert(
experiment_perf_status.client_stats.latencies.end(),
perf_status.client_stats.latencies.begin(),
perf_status.client_stats.latencies.end());
// Accumulate the overhead percentage and send rate here to remove extra
// traversals over the perf_status_reports
experiment_perf_status.overhead_pct += perf_status.overhead_pct;
experiment_perf_status.send_request_rate += perf_status.send_request_rate;
}
// Calculate the average overhead_pct for the experiment.
experiment_perf_status.overhead_pct /= perf_status_reports.size();
experiment_perf_status.send_request_rate /= perf_status_reports.size();
if (include_lib_stats_) {
for (auto& perf_status : perf_status_reports) {
experiment_perf_status.client_stats.completed_count +=
perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_request_time_ns +=
perf_status.client_stats.avg_request_time_ns *
perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_send_time_ns +=
perf_status.client_stats.avg_send_time_ns *
perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_receive_time_ns +=
perf_status.client_stats.avg_receive_time_ns *
perf_status.client_stats.completed_count;
}
if (experiment_perf_status.client_stats.completed_count != 0) {
experiment_perf_status.client_stats.avg_request_time_ns =
experiment_perf_status.client_stats.avg_request_time_ns /
experiment_perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_send_time_ns =
experiment_perf_status.client_stats.avg_send_time_ns /
experiment_perf_status.client_stats.completed_count;
experiment_perf_status.client_stats.avg_receive_time_ns =
experiment_perf_status.client_stats.avg_receive_time_ns /
experiment_perf_status.client_stats.completed_count;
}
}
RETURN_IF_ERROR(MergeServerSideStats(
server_side_stats, experiment_perf_status.server_stats));
std::sort(
experiment_perf_status.client_stats.latencies.begin(),
experiment_perf_status.client_stats.latencies.end());
float client_duration_sec =
(float)experiment_perf_status.client_stats.duration_ns / NANOS_PER_SECOND;
experiment_perf_status.client_stats.sequence_per_sec =
experiment_perf_status.client_stats.sequence_count / client_duration_sec;
experiment_perf_status.client_stats.infer_per_sec =
(experiment_perf_status.client_stats.request_count *
experiment_perf_status.batch_size) /
client_duration_sec;
experiment_perf_status.client_stats.responses_per_sec =
experiment_perf_status.client_stats.response_count / client_duration_sec;
RETURN_IF_ERROR(SummarizeLatency(
experiment_perf_status.client_stats.latencies, experiment_perf_status));
if (should_collect_metrics_) {
// Put all Metric objects in a flat vector so they're easier to merge
std::vector<std::reference_wrapper<const Metrics>> all_metrics{};
std::for_each(
perf_status_reports.begin(), perf_status_reports.end(),
[&all_metrics](const PerfStatus& p) {
std::for_each(
p.metrics.begin(), p.metrics.end(),
[&all_metrics](const Metrics& m) { all_metrics.push_back(m); });
});
Metrics merged_metrics{};
RETURN_IF_ERROR(MergeMetrics(all_metrics, merged_metrics));
experiment_perf_status.metrics.push_back(std::move(merged_metrics));
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::GetServerSideStatus(
std::map<cb::ModelIdentifier, cb::ModelStatistics>* model_stats)
{
if ((parser_->SchedulerType() == ModelParser::ENSEMBLE) ||
(parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE)) {
RETURN_IF_ERROR(profile_backend_->ModelInferenceStatistics(model_stats));
} else {
RETURN_IF_ERROR(profile_backend_->ModelInferenceStatistics(
model_stats, parser_->ModelName(), parser_->ModelVersion()));
}
return cb::Error::Success;
}
// Runs a single measurement window (time-based or count-based) and
// summarizes the client- and server-side statistics observed within it.
cb::Error
InferenceProfiler::Measure(
PerfStatus& perf_status, uint64_t measurement_window, bool is_count_based)
{
std::map<cb::ModelIdentifier, cb::ModelStatistics> start_status;
std::map<cb::ModelIdentifier, cb::ModelStatistics> end_status;
cb::InferStat start_stat;
cb::InferStat end_stat;
manager_->ResetIdleTime();
// Set current window start time to end of previous window. For first
// measurement window, capture start time, server side stats, and client side
// stats.
uint64_t window_start_ns = previous_window_end_ns_;
start_stat = prev_client_side_stats_;
start_status = prev_server_side_stats_;
if (window_start_ns == 0) {
window_start_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
if (should_collect_metrics_) {
metrics_manager_->StartQueryingMetrics();
}
if (include_server_stats_) {
RETURN_IF_ERROR(GetServerSideStatus(&start_status));
}
RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&start_stat));
}
if (should_collect_metrics_) {
try {
metrics_manager_->CheckQueryingStatus();
}
catch (const std::exception& e) {
return cb::Error(e.what(), pa::GENERIC_ERROR);
}
}
if (!is_count_based) {
// Wait for specified time interval in msec
std::this_thread::sleep_for(
std::chrono::milliseconds((uint64_t)(measurement_window_ms_ * 1.2)));
} else {
do {
// Check the health of the worker threads.
RETURN_IF_ERROR(manager_->CheckHealth());
// Wait for 1s until enough samples have been collected.
std::this_thread::sleep_for(std::chrono::milliseconds((uint64_t)1000));
} while (manager_->CountCollectedRequests() < measurement_window);
}
uint64_t window_end_ns =
std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
previous_window_end_ns_ = window_end_ns;
if (should_collect_metrics_) {
metrics_manager_->GetLatestMetrics(perf_status.metrics);
}
// Get server status and then print report on difference between
// before and after status.
if (include_server_stats_) {
RETURN_IF_ERROR(GetServerSideStatus(&end_status));
prev_server_side_stats_ = end_status;
}
RETURN_IF_ERROR(manager_->GetAccumulatedClientStat(&end_stat));
prev_client_side_stats_ = end_stat;
std::vector<RequestRecord> current_request_records;
RETURN_IF_ERROR(manager_->SwapRequestRecords(current_request_records));
all_request_records_.insert(
all_request_records_.end(), current_request_records.begin(),
current_request_records.end());
RETURN_IF_ERROR(Summarize(
start_status, end_status, start_stat, end_stat, perf_status,
window_start_ns, window_end_ns));
return cb::Error::Success;
}
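// Summarizes a single measurement window: extracts the latencies of requests
// that completed within the window, then derives client-side statistics,
// Perf Analyzer overhead, send request rate, and (optionally) server-side
// statistics.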
cb::Error
InferenceProfiler::Summarize(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns)
{
size_t valid_sequence_count = 0;
size_t delayed_request_count = 0;
size_t response_count = 0;
// Get measurement from requests that fall within the time interval
std::pair<uint64_t, uint64_t> valid_range{window_start_ns, window_end_ns};
uint64_t window_duration_ns = valid_range.second - valid_range.first;
std::vector<uint64_t> latencies;
std::vector<RequestRecord> valid_requests{};
ValidLatencyMeasurement(
valid_range, valid_sequence_count, delayed_request_count, &latencies,
response_count, valid_requests);
if (should_collect_profile_data_) {
CollectData(
summary, window_start_ns, window_end_ns, std::move(valid_requests));
}
RETURN_IF_ERROR(SummarizeLatency(latencies, summary));
RETURN_IF_ERROR(SummarizeClientStat(
start_stat, end_stat, window_duration_ns, latencies.size(),
valid_sequence_count, delayed_request_count, response_count, summary));
summary.client_stats.latencies = std::move(latencies);
SummarizeOverhead(window_duration_ns, manager_->GetIdleTime(), summary);
double window_duration_s{
window_duration_ns / static_cast<double>(NANOS_PER_SECOND)};
SummarizeSendRequestRate(
window_duration_s, manager_->GetAndResetNumSentRequests(), summary);
if (include_server_stats_) {
RETURN_IF_ERROR(SummarizeServerStats(
start_status, end_status, &(summary.server_stats)));
}
return cb::Error::Success;
}
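// Collects the latencies of requests whose final response arrived within the
// measurement window, counts their responses, sequences, and delays, and
// removes the consumed records from 'all_request_records_'.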
void
InferenceProfiler::ValidLatencyMeasurement(
const std::pair<uint64_t, uint64_t>& valid_range,
size_t& valid_sequence_count, size_t& delayed_request_count,
std::vector<uint64_t>* valid_latencies, size_t& response_count,
std::vector<RequestRecord>& valid_requests)
{
valid_latencies->clear();
valid_sequence_count = 0;
response_count = 0;
std::vector<size_t> erase_indices{};
for (size_t i = 0; i < all_request_records_.size(); i++) {
const auto& request_record = all_request_records_[i];
uint64_t request_start_ns = CHRONO_TO_NANOS(request_record.start_time_);
uint64_t request_end_ns;
if (request_record.has_null_last_response_ == false) {
request_end_ns = CHRONO_TO_NANOS(request_record.response_times_.back());
} else if (request_record.response_times_.size() > 1) {
size_t last_response_idx{request_record.response_times_.size() - 2};
request_end_ns =
CHRONO_TO_NANOS(request_record.response_times_[last_response_idx]);
} else {
erase_indices.push_back(i);
continue;
}
if (request_start_ns <= request_end_ns) {
// Only counting requests that end within the time interval
if ((request_end_ns >= valid_range.first) &&
(request_end_ns <= valid_range.second)) {
valid_latencies->push_back(request_end_ns - request_start_ns);
response_count += request_record.response_times_.size();
if (request_record.has_null_last_response_) {
response_count--;
}
erase_indices.push_back(i);
if (request_record.sequence_end_) {
valid_sequence_count++;
}
if (request_record.delayed_) {
delayed_request_count++;
}
}
}
}
std::for_each(
erase_indices.begin(), erase_indices.end(),
[this, &valid_requests](size_t i) {
valid_requests.push_back(std::move(this->all_request_records_[i]));
});
// Iterate through erase indices backwards so that erases from
// `all_request_records_` happen from the back to the front to avoid using
// wrong indices after subsequent erases
std::for_each(erase_indices.rbegin(), erase_indices.rend(), [this](size_t i) {
this->all_request_records_.erase(this->all_request_records_.begin() + i);
});
// Always sort the measured latencies, since percentiles are reported by default
std::sort(valid_latencies->begin(), valid_latencies->end());
}
void
InferenceProfiler::CollectData(
PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns,
std::vector<RequestRecord>&& request_records)
{
InferenceLoadMode id{summary.concurrency, summary.request_rate};
collector_->AddWindow(id, window_start_ns, window_end_ns);
collector_->AddData(id, std::move(request_records));
}
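// Computes the average latency, its standard deviation, and the default
// latency percentiles (p50/p90/p95/p99, plus the user-requested percentile if
// any), and selects the value used as the stability criterion.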
cb::Error
InferenceProfiler::SummarizeLatency(
const std::vector<uint64_t>& latencies, PerfStatus& summary)
{
if (latencies.size() == 0) {
return cb::Error(
"No valid requests recorded within time interval."
" Please use a larger time window.",
pa::OPTION_ERROR);
}
std::tie(summary.client_stats.avg_latency_ns, summary.client_stats.std_us) =
GetMeanAndStdDev(latencies);
// Retrieve the other percentiles of interest
summary.client_stats.percentile_latency_ns.clear();
std::set<size_t> percentiles{50, 90, 95, 99};
if (extra_percentile_) {
percentiles.emplace(percentile_);
}
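// Index of the requested percentile in the sorted latencies, rounded to the
// nearest entry: e.g. p90 of 11 samples uses index round(0.9 * 10) = 9.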
for (const auto percentile : percentiles) {
size_t index = (percentile / 100.0) * (latencies.size() - 1) + 0.5;
summary.client_stats.percentile_latency_ns.emplace(
percentile, latencies[index]);
}
if (extra_percentile_) {
summary.stabilizing_latency_ns =
summary.client_stats.percentile_latency_ns.find(percentile_)->second;
} else {
summary.stabilizing_latency_ns = summary.client_stats.avg_latency_ns;
}
return cb::Error::Success;
}
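// Returns the mean latency in nanoseconds and the sample standard deviation
// (Bessel-corrected, i.e. dividing by n - 1) converted to microseconds.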
std::tuple<uint64_t, uint64_t>
InferenceProfiler::GetMeanAndStdDev(const std::vector<uint64_t>& latencies)
{
uint64_t avg_latency_ns{0};
uint64_t std_dev_latency_us{0};
// calculate mean of latencies
uint64_t tol_latency_ns{
std::accumulate(latencies.begin(), latencies.end(), 0ULL)};
avg_latency_ns = tol_latency_ns / latencies.size();
// calculate sample standard deviation of latencies
uint64_t sq_sum_latency_avg_diff_ns{0};
std::for_each(
latencies.begin(), latencies.end(),
[avg_latency_ns, &sq_sum_latency_avg_diff_ns](uint64_t l) {
sq_sum_latency_avg_diff_ns += static_cast<int64_t>(l - avg_latency_ns) *
static_cast<int64_t>(l - avg_latency_ns);
});
if (latencies.size() > 1) {
std_dev_latency_us =
std::sqrt(sq_sum_latency_avg_diff_ns / (latencies.size() - 1)) / 1000;
} else {
std_dev_latency_us = UINT64_MAX;
std::cerr << "WARNING: Pass contained only one request, so sample latency "
"standard deviation will be infinity (UINT64_MAX)."
<< std::endl;
}
return std::make_tuple(avg_latency_ns, std_dev_latency_us);
}
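// Fills in the client-side statistics for the window: throughput and
// per-second rates from the counted requests/responses/sequences, plus
// average request/send/receive times when library stats are included.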
cb::Error
InferenceProfiler::SummarizeClientStat(
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
const uint64_t duration_ns, const size_t valid_request_count,
const size_t valid_sequence_count, const size_t delayed_request_count,
const size_t response_count, PerfStatus& summary)
{
summary.on_sequence_model =
((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
(parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE));
summary.batch_size = std::max(manager_->BatchSize(), (size_t)1);
summary.client_stats.request_count = valid_request_count;
summary.client_stats.sequence_count = valid_sequence_count;
summary.client_stats.delayed_request_count = delayed_request_count;
summary.client_stats.response_count = response_count;
summary.client_stats.duration_ns = duration_ns;
float client_duration_sec =
(float)summary.client_stats.duration_ns / NANOS_PER_SECOND;
summary.client_stats.sequence_per_sec =
valid_sequence_count / client_duration_sec;
summary.client_stats.infer_per_sec =
(valid_request_count * summary.batch_size) / client_duration_sec;
summary.client_stats.responses_per_sec = response_count / client_duration_sec;
if (include_lib_stats_) {
size_t completed_count =
end_stat.completed_request_count - start_stat.completed_request_count;
uint64_t request_time_ns = end_stat.cumulative_total_request_time_ns -
start_stat.cumulative_total_request_time_ns;
summary.client_stats.completed_count = completed_count;
uint64_t send_time_ns =
end_stat.cumulative_send_time_ns - start_stat.cumulative_send_time_ns;
uint64_t receive_time_ns = end_stat.cumulative_receive_time_ns -
start_stat.cumulative_receive_time_ns;
if (completed_count != 0) {
summary.client_stats.avg_request_time_ns =
request_time_ns / completed_count;
summary.client_stats.avg_send_time_ns = send_time_ns / completed_count;
summary.client_stats.avg_receive_time_ns =
receive_time_ns / completed_count;
}
}
return cb::Error::Success;
}
void
InferenceProfiler::SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary)
{
if (window_duration_s <= 0.0) {
throw std::runtime_error("window_duration_s must be positive");
}
summary.send_request_rate = num_sent_requests / window_duration_s;
}
cb::Error
InferenceProfiler::DetermineStatsModelVersion(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
int64_t* status_model_version)
{
// If model_version is unspecified then look in the stats to find the
// version with stats that incremented during the measurement.
//
// If multiple versions had incremented stats, use the highest numbered one
// and print a warning
*status_model_version = -1;
bool multiple_found = false;
bool version_unspecified = model_identifier.second.empty();
if (version_unspecified) {
for (const auto& x : end_stats) {
const auto& end_id = x.first;
const auto& end_stat = x.second;
bool is_correct_model_name =
model_identifier.first.compare(end_id.first) == 0;
if (is_correct_model_name) {
uint64_t end_queue_count = end_stat.queue_count_;
uint64_t start_queue_count = 0;
const auto& itr = start_stats.find(end_id);
if (itr != start_stats.end()) {
start_queue_count = itr->second.queue_count_;
}
if (end_queue_count > start_queue_count) {
int64_t this_version = std::stoll(end_id.second);
if (*status_model_version != -1) {
multiple_found = true;
}
*status_model_version = std::max(*status_model_version, this_version);
}
}
}
} else {
const auto& itr = end_stats.find(model_identifier);
if (itr != end_stats.end()) {
*status_model_version = std::stoll(model_identifier.second);
}
}
if (*status_model_version == -1) {
return cb::Error(
"failed to find the requested model version", pa::GENERIC_ERROR);
}
if (multiple_found) {
std::cerr << "WARNING: Multiple versions of model "
<< model_identifier.first
<< " are loaded in the triton server, and the version to use was "
"unspecified. The stats for that model may be inaccurate."
<< std::endl;
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStats(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats)
{
RETURN_IF_ERROR(SummarizeServerStats(
std::make_pair(parser_->ModelName(), parser_->ModelVersion()),
start_status, end_status, server_stats));
return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStats(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats)
{
RETURN_IF_ERROR(SummarizeServerStatsHelper(
model_identifier, start_status, end_status, server_stats));
// Summarize the composing models, if any.
for (auto composing_model_identifier :
(*parser_->GetComposingModelMap())[model_identifier.first]) {
int64_t model_version;
RETURN_IF_ERROR(DetermineStatsModelVersion(
composing_model_identifier, start_status, end_status, &model_version));
composing_model_identifier.second = std::to_string(model_version);
auto it = server_stats->composing_models_stat
.emplace(composing_model_identifier, ServerSideStats())
.first;
RETURN_IF_ERROR(SummarizeServerStats(
composing_model_identifier, start_status, end_status, &(it->second)));
}
return cb::Error::Success;
}
cb::Error
InferenceProfiler::SummarizeServerStatsHelper(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats)
{
int64_t model_version;
RETURN_IF_ERROR(DetermineStatsModelVersion(
model_identifier, start_status, end_status, &model_version));
const std::pair<std::string, std::string> this_id(
model_identifier.first, std::to_string(model_version));
const auto& end_itr = end_status.find(this_id);
if (end_itr == end_status.end()) {
return cb::Error(
"missing statistics for requested model", pa::GENERIC_ERROR);
} else {
uint64_t start_infer_cnt = 0;
uint64_t start_exec_cnt = 0;
uint64_t start_cnt = 0;
uint64_t start_queue_cnt = 0;
uint64_t start_compute_input_cnt = 0;
uint64_t start_compute_infer_cnt = 0;
uint64_t start_compute_output_cnt = 0;
uint64_t start_cumm_time_ns = 0;
uint64_t start_queue_time_ns = 0;
uint64_t start_compute_input_time_ns = 0;
uint64_t start_compute_infer_time_ns = 0;
uint64_t start_compute_output_time_ns = 0;
uint64_t start_cache_hit_cnt = 0;
uint64_t start_cache_hit_time_ns = 0;
uint64_t start_cache_miss_cnt = 0;
uint64_t start_cache_miss_time_ns = 0;
const auto& start_itr = start_status.find(this_id);
if (start_itr != start_status.end()) {
start_infer_cnt = start_itr->second.inference_count_;
start_exec_cnt = start_itr->second.execution_count_;
start_cnt = start_itr->second.success_count_;
start_queue_cnt = start_itr->second.queue_count_;
start_compute_input_cnt = start_itr->second.compute_input_count_;
start_compute_infer_cnt = start_itr->second.compute_infer_count_;
start_compute_output_cnt = start_itr->second.compute_output_count_;
start_cumm_time_ns = start_itr->second.cumm_time_ns_;
start_queue_time_ns = start_itr->second.queue_time_ns_;
start_compute_input_time_ns = start_itr->second.compute_input_time_ns_;
start_compute_infer_time_ns = start_itr->second.compute_infer_time_ns_;
start_compute_output_time_ns = start_itr->second.compute_output_time_ns_;
start_cache_hit_cnt = start_itr->second.cache_hit_count_;
start_cache_hit_time_ns = start_itr->second.cache_hit_time_ns_;
start_cache_miss_cnt = start_itr->second.cache_miss_count_;
start_cache_miss_time_ns = start_itr->second.cache_miss_time_ns_;
}
server_stats->inference_count =
end_itr->second.inference_count_ - start_infer_cnt;
server_stats->execution_count =
end_itr->second.execution_count_ - start_exec_cnt;
server_stats->success_count = end_itr->second.success_count_ - start_cnt;
server_stats->queue_count = end_itr->second.queue_count_ - start_queue_cnt;
server_stats->compute_input_count =
end_itr->second.compute_input_count_ - start_compute_input_cnt;
server_stats->compute_infer_count =
end_itr->second.compute_infer_count_ - start_compute_infer_cnt;
server_stats->compute_output_count =
end_itr->second.compute_output_count_ - start_compute_output_cnt;
server_stats->cumm_time_ns =
end_itr->second.cumm_time_ns_ - start_cumm_time_ns;
server_stats->queue_time_ns =
end_itr->second.queue_time_ns_ - start_queue_time_ns;
server_stats->compute_input_time_ns =
end_itr->second.compute_input_time_ns_ - start_compute_input_time_ns;
server_stats->compute_infer_time_ns =
end_itr->second.compute_infer_time_ns_ - start_compute_infer_time_ns;
server_stats->compute_output_time_ns =
end_itr->second.compute_output_time_ns_ - start_compute_output_time_ns;
server_stats->cache_hit_count =
end_itr->second.cache_hit_count_ - start_cache_hit_cnt;
server_stats->cache_hit_time_ns =
end_itr->second.cache_hit_time_ns_ - start_cache_hit_time_ns;
server_stats->cache_miss_count =
end_itr->second.cache_miss_count_ - start_cache_miss_cnt;
server_stats->cache_miss_time_ns =
end_itr->second.cache_miss_time_ns_ - start_cache_miss_time_ns;
}
return cb::Error::Success;
}
void
InferenceProfiler::SummarizeOverhead(
const uint64_t window_duration_ns, const uint64_t idle_ns,
PerfStatus& summary)
{
// The window start/stop is not instantaneous. It is possible that the PA
// overhead is smaller than the delay in the window start/stop process. Treat
// it as 0% overhead (100% idle) in that case
//
if (idle_ns > window_duration_ns) {
summary.overhead_pct = 0;
} else {
uint64_t overhead_ns = window_duration_ns - idle_ns;
double overhead_pct = double(overhead_ns) / window_duration_ns * 100;
summary.overhead_pct = overhead_pct;
}
}
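// Broadcasts each rank's stability flag to all MPI ranks and returns true
// only if every rank has reported a stable measurement.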
bool
InferenceProfiler::AllMPIRanksAreStable(bool current_rank_stability)
{
int world_size{mpi_driver_->MPICommSizeWorld()};
std::vector<int> stabilities_per_rank{};
stabilities_per_rank.resize(world_size, 0);
int my_rank{mpi_driver_->MPICommRankWorld()};
stabilities_per_rank[my_rank] = static_cast<int>(current_rank_stability);
for (int rank{0}; rank < world_size; rank++) {
mpi_driver_->MPIBcastIntWorld(stabilities_per_rank.data() + rank, 1, rank);
}
bool all_stable{true};
for (int rank{0}; rank < world_size; rank++) {
if (stabilities_per_rank[rank] == 0) {
all_stable = false;
break;
}
}
if (verbose_ && all_stable) {
std::cout << "All models on all MPI ranks are stable" << std::endl;
}
return all_stable;
}
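// Merges the per-window GPU metrics: utilization and power usage are averaged
// per GPU, memory used takes the per-GPU maximum, and memory total takes the
// first observed value.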
cb::Error
InferenceProfiler::MergeMetrics(
const std::vector<std::reference_wrapper<const Metrics>>& all_metrics,
Metrics& merged_metrics)
{
// Maps from each metric collection mapping gpu uuid to gpu utilization
std::vector<std::reference_wrapper<const std::map<std::string, double>>>
gpu_utilization_per_gpu_maps{};
// Maps from each metric collection mapping gpu uuid to gpu power usage
std::vector<std::reference_wrapper<const std::map<std::string, double>>>
gpu_power_usage_per_gpu_maps{};
// Maps from each metric collection mapping gpu uuid to gpu memory used bytes
std::vector<std::reference_wrapper<const std::map<std::string, uint64_t>>>
gpu_memory_used_bytes_per_gpu_maps{};
// Maps from each metric collection mapping gpu uuid to gpu memory total bytes
std::vector<std::reference_wrapper<const std::map<std::string, uint64_t>>>
gpu_memory_total_bytes_per_gpu_maps{};
// Put all metric maps in vector so they're easier to aggregate
std::for_each(
all_metrics.begin(), all_metrics.end(),
[&gpu_utilization_per_gpu_maps, &gpu_power_usage_per_gpu_maps,
&gpu_memory_used_bytes_per_gpu_maps,
&gpu_memory_total_bytes_per_gpu_maps](
const std::reference_wrapper<const Metrics> m) {
gpu_utilization_per_gpu_maps.push_back(m.get().gpu_utilization_per_gpu);
gpu_power_usage_per_gpu_maps.push_back(m.get().gpu_power_usage_per_gpu);
gpu_memory_used_bytes_per_gpu_maps.push_back(
m.get().gpu_memory_used_bytes_per_gpu);
gpu_memory_total_bytes_per_gpu_maps.push_back(
m.get().gpu_memory_total_bytes_per_gpu);
});
GetMetricAveragePerGPU<double>(
gpu_utilization_per_gpu_maps, merged_metrics.gpu_utilization_per_gpu);
GetMetricAveragePerGPU<double>(
gpu_power_usage_per_gpu_maps, merged_metrics.gpu_power_usage_per_gpu);
GetMetricMaxPerGPU<uint64_t>(
gpu_memory_used_bytes_per_gpu_maps,
merged_metrics.gpu_memory_used_bytes_per_gpu);
GetMetricFirstPerGPU<uint64_t>(
gpu_memory_total_bytes_per_gpu_maps,
merged_metrics.gpu_memory_total_bytes_per_gpu);
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <algorithm>
#include <cstdint>
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <thread>
#include <tuple>
#include <vector>
#include "concurrency_manager.h"
#include "constants.h"
#include "custom_load_manager.h"
#include "metrics.h"
#include "metrics_manager.h"
#include "model_parser.h"
#include "mpi_utils.h"
#include "profile_data_collector.h"
#include "request_rate_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferenceProfiler;
class TestInferenceProfiler;
#endif
/// Constant parameters that determine whether the stopping criteria have been
/// met for the current phase of testing
struct LoadParams {
// The number of measurements to account for during calculation of load
// status
uint32_t stability_window;
// The +/- range to account for while assessing load status
double stability_threshold;
};
/// Data structure to keep track of real-time load status and determine whether
/// the stopping criteria have been met for the current phase of testing.
struct LoadStatus {
// Stores the observations of infer_per_sec and latencies in a vector
std::vector<double> infer_per_sec;
std::vector<uint64_t> latencies;
// Records the average inference per second within the stability window
double avg_ips = 0;
// Stores the average latency within the stability window
uint64_t avg_latency = 0;
};
// Holds the totals of the timing components of the composing models of an
// ensemble.
struct EnsembleDurations {
EnsembleDurations()
: total_queue_time_avg_us(0), total_compute_time_avg_us(0),
total_cache_hit_time_avg_us(0), total_cache_miss_time_avg_us(0),
total_combined_cache_compute_time_avg_us(0)
{
}
uint64_t total_queue_time_avg_us;
uint64_t total_compute_time_avg_us;
// Time spent on cache lookups/copies for cache hits
uint64_t total_cache_hit_time_avg_us;
// Time spent on cache lookups/copies/insertions for cache misses
uint64_t total_cache_miss_time_avg_us;
// Combined average of cache and compute times
uint64_t total_combined_cache_compute_time_avg_us;
};
/// Holds the server-side inference statistics of the target model and its
/// composing models
struct ServerSideStats {
uint64_t inference_count;
uint64_t execution_count;
uint64_t cache_hit_count;
uint64_t cache_miss_count;
uint64_t success_count;
uint64_t queue_count;
uint64_t compute_input_count;
uint64_t compute_infer_count;
uint64_t compute_output_count;
uint64_t cumm_time_ns;
uint64_t queue_time_ns;
uint64_t compute_input_time_ns;
uint64_t compute_infer_time_ns;
uint64_t compute_output_time_ns;
// Time spent on cache lookups/copies for cache hits
uint64_t cache_hit_time_ns;
// Time spent on cache lookups/copies/insertions for cache misses
uint64_t cache_miss_time_ns;
std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
};
/// Holds the statistics recorded at the client side.
struct ClientSideStats {
// Request count and elapsed time measured by client
uint64_t request_count;
// Only record sequences that finish within the measurement window
uint64_t sequence_count;
// The number of requests that missed their schedule
uint64_t delayed_request_count;
// The number of responses
uint64_t response_count;
uint64_t duration_ns;
uint64_t avg_latency_ns;
// An ordered map of percentiles to be reported (<percentile, value> pairs)
std::map<size_t, uint64_t> percentile_latency_ns;
// List of all the valid latencies.
std::vector<uint64_t> latencies;
// Stored in usec to avoid squaring very large nanosecond values
uint64_t std_us;
uint64_t avg_request_time_ns;
uint64_t avg_send_time_ns;
uint64_t avg_receive_time_ns;
// Per sec stat
double infer_per_sec;
double responses_per_sec;
double sequence_per_sec;
// Completed request count reported by the client library
uint64_t completed_count;
};
/// The entire statistics record.
struct PerfStatus {
uint32_t concurrency;
double request_rate;
size_t batch_size;
ServerSideStats server_stats;
ClientSideStats client_stats;
std::vector<Metrics> metrics{};
double overhead_pct;
bool on_sequence_model;
// placeholder for the latency value that is used for conditional checking
uint64_t stabilizing_latency_ns;
// Metric for requests sent per second
double send_request_rate{0.0};
};
cb::Error ReportPrometheusMetrics(const Metrics& metrics);
//==============================================================================
/// An InferenceProfiler is a helper class that measures and summarizes
/// inference statistics under different concurrency levels.
///
/// The profiler can adjust the number of concurrent requests by informing the
/// concurrency manager. After the adjustment, the profiler actively collects
/// statistics from both the concurrency manager and the inference server until
/// the measurements are stable. Once stable, the profiler updates the
/// 'status_summary' based on the most recent measurement.
///
/// The measurement procedure:
/// 1. The profiler gets start status from the server and records the start
/// time.
/// 2. After given time interval, the profiler gets end status from the server
/// and records the end time.
/// 3. The profiler obtains the request records recorded by concurrency manager,
/// and uses the request records that are recorded between start time and end
/// time to measure client side status and update status_summary.
///
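/// Illustrative usage sketch (variable names are placeholders; actual call
/// sites construct these objects elsewhere):
///
///   std::unique_ptr<InferenceProfiler> profiler;
///   RETURN_IF_ERROR(InferenceProfiler::Create(
///       verbose, stability_threshold, measurement_window_ms, max_trials,
///       percentile, latency_threshold_ms, protocol, parser, profile_backend,
///       std::move(manager), &profiler, measurement_request_count,
///       measurement_mode, mpi_driver, metrics_interval_ms,
///       should_collect_metrics, overhead_pct_threshold, collector,
///       should_collect_profile_data));
///   std::vector<PerfStatus> perf_statuses;
///   // Sweep concurrency 1..16 in steps of 2 using linear search.
///   RETURN_IF_ERROR(profiler->Profile<size_t>(
///       1, 16, 2, SearchMode::LINEAR, perf_statuses));
///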
class InferenceProfiler {
public:
/// Create a profiler that collects and summarizes inference statistics.
/// \param verbose Whether to print verbose logging.
/// \param stability_threshold The range within which a measurement is
/// considered stable, i.e. within (1 +/- stability_threshold) * the average
/// value of the last 3 measurements. The criteria are "infer per second" and
/// "average latency", or "infer per second" and "percentile latency" if a
/// valid percentile is set (see 'percentile' below).
/// \param measurement_window_ms The duration of each measurement in msec.
/// \param max_trials The maximum number of attempts to obtain
/// stable measurement.
/// \param percentile The percentile in terms of latency to be reported.
/// If it is a valid percentile value, the percentile latency will be reported
/// and used as the stability criterion instead of the average latency. If it
/// is -1, the average latency will be reported and used as the stability
/// criterion.
/// \param latency_threshold_ms The threshold on the latency measurements in
/// milliseconds.
/// \param parser The ModelParser object which holds all the details about the
/// model.
/// \param profile_backend The ClientBackend object used by the profiler to
/// communicate with the server.
/// \param manager The LoadManager object that will produce load on the
/// server.
/// \param profiler Returns a new InferenceProfiler object.
/// \param measurement_request_count The number of requests to capture when
/// using "count_windows" mode.
/// \param measurement_mode The measurement mode to use for windows.
/// \param mpi_driver The driver class for MPI operations.
/// \param metrics_interval_ms The interval, in milliseconds, at which the
/// server-side metrics are queried.
/// \param should_collect_metrics Whether server-side inference server metrics
/// should be collected.
/// \param overhead_pct_threshold User set threshold above which the PA
/// overhead is too significant to provide usable results.
/// \param collector Collector for the profile data from experiments
/// \param should_collect_profile_data Whether to collect profile data.
/// \return cb::Error object indicating success or failure.
static cb::Error Create(
const bool verbose, const double stability_threshold,
const uint64_t measurement_window_ms, const size_t max_trials,
const int64_t percentile, const uint64_t latency_threshold_ms,
const cb::ProtocolType protocol, std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager,
std::unique_ptr<InferenceProfiler>* profiler,
uint64_t measurement_request_count, MeasurementMode measurement_mode,
std::shared_ptr<MPIDriver> mpi_driver, const uint64_t metrics_interval_ms,
const bool should_collect_metrics, const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data);
/// Performs the profiling on the given range with the given search algorithm.
/// For profiling using request rate, invoke the template with double;
/// otherwise invoke it with size_t for a concurrency search.
/// \param start The starting point of the search range.
/// \param end The ending point of the search range.
/// \param step The step size to move along the search range in linear search
/// or the precision in binary search.
/// \param search_mode The search algorithm to be applied.
/// \param perf_statuses Returns the trace of the measurements along the
/// search path.
/// \return cb::Error object indicating success or failure.
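///
/// Usage sketch (illustrative only; assumes an InferenceProfiler already
/// constructed via Create() and hypothetical sweep bounds): a linear
/// concurrency sweep from 1 to 8 in steps of 1 could look like
/// \code
///   std::vector<PerfStatus> perf_statuses;
///   cb::Error err =
///       profiler->Profile<size_t>(1, 8, 1, SearchMode::LINEAR, perf_statuses);
///   if (!err.IsOk()) {
///     std::cerr << err << std::endl;
///   }
/// \endcode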
template <typename T>
cb::Error Profile(
const T start, const T end, const T step, const SearchMode search_mode,
std::vector<PerfStatus>& perf_statuses)
{
cb::Error err;
bool meets_threshold, is_stable;
if (search_mode == SearchMode::NONE) {
err = Profile(perf_statuses, meets_threshold, is_stable);
if (!err.IsOk()) {
return err;
}
} else if (search_mode == SearchMode::LINEAR) {
T current_value = start;
do {
err = Profile(current_value, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk()) {
return err;
}
current_value += step;
} while (((current_value <= end) || (end == static_cast<T>(NO_LIMIT))) &&
(meets_threshold));
// If there was only one concurrency we swept over and it did not meet the
// stability threshold, we should return an error.
if (current_value == (start + step) && is_stable == false) {
return cb::Error(
"Failed to obtain stable measurement.", pa::STABILITY_ERROR);
}
} else {
err = Profile(start, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk() || (!meets_threshold)) {
return err;
}
err = Profile(end, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk() || (meets_threshold)) {
return err;
}
T this_start = start;
T this_end = end;
while ((this_end - this_start) > step) {
T current_value = (this_end + this_start) / 2;
err = Profile(current_value, perf_statuses, meets_threshold, is_stable);
if (!err.IsOk()) {
return err;
}
if (meets_threshold) {
this_start = current_value;
} else {
this_end = current_value;
}
}
}
return cb::Error::Success;
}
bool IncludeServerStats() { return include_server_stats_; }
private:
InferenceProfiler(
const bool verbose, const double stability_threshold,
const int32_t measurement_window_ms, const size_t max_trials,
const bool extra_percentile, const size_t percentile,
const uint64_t latency_threshold_ms, const cb::ProtocolType protocol,
std::shared_ptr<ModelParser>& parser,
std::shared_ptr<cb::ClientBackend> profile_backend,
std::unique_ptr<LoadManager> manager, uint64_t measurement_request_count,
MeasurementMode measurement_mode, std::shared_ptr<MPIDriver> mpi_driver,
const uint64_t metrics_interval_ms, const bool should_collect_metrics,
const double overhead_pct_threshold,
const std::shared_ptr<ProfileDataCollector> collector,
const bool should_collect_profile_data);
/// Actively measure throughput every 'measurement_window' msec until the
/// throughput is stable. Once the throughput is stable, it adds the
/// observations to the summary trace and returns whether the setting met the
/// threshold. NOTE: the requests are being sent regardless of the
/// measurement, so the data returned by the server (see struct PerfStatus)
/// will include more requests than what the client measures (we can't get the
/// exact server status right before the first request and right after the
/// last request in the measurement window).
/// \param concurrent_request_count The concurrency level for the measurement.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list.
/// \param meets_threshold Returns whether the setting meets the threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
cb::Error Profile(
const size_t concurrent_request_count,
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable);
/// Similar to above function, but instead of setting the concurrency, it
/// sets the specified request rate for measurements.
/// \param request_rate The request rate for inferences.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list.
/// \param meets_threshold Returns whether the setting meets the threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
cb::Error Profile(
const double request_rate, std::vector<PerfStatus>& perf_statuses,
bool& meets_threshold, bool& is_stable);
/// Measures throughput and latencies for custom load without controlling
/// request rate or concurrency. Requires the load manager to be loaded with
/// a file specifying the time intervals.
/// \param perf_statuses Appends the measurements summary at the end of this
/// list.
/// \param meets_threshold Returns whether the measurement met the threshold.
/// \param is_stable Returns whether the measurement is stable.
/// \return cb::Error object indicating success or failure.
cb::Error Profile(
std::vector<PerfStatus>& perf_statuses, bool& meets_threshold,
bool& is_stable);
/// A helper function for profiling functions.
/// \param status_summary Returns the summary of the measurement.
/// \param is_stable Returns whether the measurement stabilized or not.
/// \return cb::Error object indicating success or failure.
cb::Error ProfileHelper(PerfStatus& status_summary, bool* is_stable);
/// A helper function to determine if profiling is stable
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether the throughput and latencies are stable.
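///
/// Illustrative check (hypothetical numbers): with stability_threshold = 0.1
/// and the last three windows measuring 100, 105, and 98 infer/sec, the
/// average is ~101 infer/sec and every window falls within
/// (1 +/- 0.1) * 101 = [90.9, 111.1], so throughput is considered stable; the
/// same band test is applied to the latency values.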
bool DetermineStability(LoadStatus& load_status);
/// Check if latency at index idx is within the latency threshold
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether the latencies are below the max threshold
bool CheckWithinThreshold(size_t idx, LoadStatus& load_status);
/// A helper function to determine if profiling is done
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \param is_stable Returns whether the measurement stabilized or not.
/// \return Returns if we should break out of the infinite stability check
/// loop.
bool IsDoneProfiling(LoadStatus& load_status, bool* is_stable);
/// Check if observed inferences and latencies are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether inference and latency are stable
bool CheckWindowForStability(size_t idx, LoadStatus& load_status);
/// Check if observed inferences are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether inference is stable
bool IsInferWindowStable(size_t idx, LoadStatus& load_status);
/// Check if observed latencies are within threshold
/// for a single window starting at idx
/// \param idx index in latency vector
/// \param load_status Stores the observations of infer_per_sec and latencies
/// \return Returns whether latency is stable
bool IsLatencyWindowStable(size_t idx, LoadStatus& load_status);
/// Helper function to perform measurement.
/// \param status_summary The summary of this measurement.
/// \param measurement_window The number of requests or the duration in
/// milliseconds over which to collect requests.
/// \param is_count_based Determines whether measurement_window indicates a
/// count or a duration.
/// \return cb::Error object indicating success or failure.
cb::Error Measure(
PerfStatus& status_summary, uint64_t measurement_window,
bool is_count_based);
/// Gets the server side statistics
/// \param model_status Returns the status of the models provided by
/// the server. If the model being profiled is a non-ensemble model,
/// only its status will be returned. Otherwise, the status of the composing
/// models will also be returned.
/// \return cb::Error object indicating success or failure.
cb::Error GetServerSideStatus(
std::map<cb::ModelIdentifier, cb::ModelStatistics>* model_status);
/// Summarize the measurement with the provided statistics.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param start_stat The accumulated context status at the start.
/// \param end_stat The accumulated context status at the end.
/// \param summary Returns the summary of the measurement.
/// \param window_start_ns The window start timestamp in nanoseconds.
/// \param window_end_ns The window end timestamp in nanoseconds.
/// \return cb::Error object indicating success or failure.
cb::Error Summarize(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
PerfStatus& summary, uint64_t window_start_ns, uint64_t window_end_ns);
/// \param valid_range The start and end timestamp of the measurement window.
/// \param valid_sequence_count Returns the number of completed sequences
/// during the measurement. A sequence is a set of correlated requests sent to
/// a sequence model.
/// \param delayed_request_count Returns the number of requests that missed
/// their schedule.
/// \param latencies Returns the vector of latencies for requests completed
/// within the measurement window.
/// \param response_count Returns the number of responses.
/// \param valid_requests Returns a vector of valid request records.
virtual void ValidLatencyMeasurement(
const std::pair<uint64_t, uint64_t>& valid_range,
size_t& valid_sequence_count, size_t& delayed_request_count,
std::vector<uint64_t>* latencies, size_t& response_count,
std::vector<RequestRecord>& valid_requests);
/// Add the data from the request records to the Raw Data Collector
/// \param perf_status PerfStatus of the current measurement
/// \param window_start_ns The window start timestamp in nanoseconds.
/// \param window_end_ns The window end timestamp in nanoseconds.
/// \param request_records The request records to collect.
void CollectData(
PerfStatus& perf_status, uint64_t window_start_ns, uint64_t window_end_ns,
std::vector<RequestRecord>&& request_records);
/// \param latencies The vector of request latencies collected.
/// \param summary Returns the summary with the latency-related fields set.
/// \return cb::Error object indicating success or failure.
virtual cb::Error SummarizeLatency(
const std::vector<uint64_t>& latencies, PerfStatus& summary);
/// \param latencies The vector of request latencies collected.
/// \return std::tuple object containing:
/// * mean of latencies in nanoseconds
/// * sample standard deviation of latencies in nanoseconds
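///
/// For reference, these follow the standard definitions: for n samples
/// x_1..x_n, mean = (sum of x_i) / n and the sample standard deviation is
/// sqrt((sum of (x_i - mean)^2) / (n - 1)), computed over the nanosecond
/// latency values.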
std::tuple<uint64_t, uint64_t> GetMeanAndStdDev(
const std::vector<uint64_t>& latencies);
/// \param start_stat The accumulated client statistics at the start.
/// \param end_stat The accumulated client statistics at the end.
/// \param duration_ns The duration of the measurement in nsec.
/// \param valid_request_count The number of completed requests recorded.
/// \param valid_sequence_count The number of completed sequences recorded.
/// \param delayed_request_count The number of requests that missed their
/// schedule.
/// \param response_count The number of responses.
/// \param summary Returns the summary with the fields recorded by the client
/// set.
/// \return cb::Error object indicating success or failure.
virtual cb::Error SummarizeClientStat(
const cb::InferStat& start_stat, const cb::InferStat& end_stat,
const uint64_t duration_ns, const size_t valid_request_count,
const size_t delayed_request_count, const size_t valid_sequence_count,
const size_t response_count, PerfStatus& summary);
/// Adds the send request rate metric to the summary object.
/// \param window_duration_s The duration of the window in seconds.
/// \param num_sent_requests The number of requests sent during the last
/// window.
/// \param summary The summary object to be updated with the send request rate
/// metric.
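///
/// Illustrative calculation (hypothetical numbers): with num_sent_requests =
/// 500 and window_duration_s = 5.0, the send request rate would be
/// 500 / 5.0 = 100 requests per second.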
void SummarizeSendRequestRate(
const double window_duration_s, const size_t num_sent_requests,
PerfStatus& summary);
/// Given a model_identifier to gather stats for, and a map of ALL stats,
/// determine which version of the model should be gathered
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model
/// \param start_stats The stats for all models at the start of the
/// measurement
/// \param end_stats The stats for all models at the end of the measurement
/// \param model_version Returns the determined model version.
/// \return cb::Error object indicating success or failure.
cb::Error DetermineStatsModelVersion(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
int64_t* model_version);
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary with the fields recorded by the
/// server set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeServerStats(
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats);
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary with the fields recorded by the
/// server set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeServerStats(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats);
/// \param model_identifier A pair of model_name and model_version to identify
/// a specific model.
/// \param start_status The model status at the start of the measurement.
/// \param end_status The model status at the end of the measurement.
/// \param server_stats Returns the summary with the fields recorded by the
/// server set.
/// \return cb::Error object indicating success or failure.
cb::Error SummarizeServerStatsHelper(
const cb::ModelIdentifier& model_identifier,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_status,
ServerSideStats* server_stats);
/// Calculate the overhead and put the results into the summary
///
/// \param window_duration_ns The duration of the window
/// \param idle_ns The average worker idle time during the window
/// \param summary The summary object to be updated with overhead stats
///
void SummarizeOverhead(
const uint64_t window_duration_ns, const uint64_t idle_ns,
PerfStatus& summary);
/// Returns true if all MPI ranks (models) are stable. Should be called only
/// if IsMPIRun() returns true.
/// \param current_rank_stability The stability of the current rank.
/// \return True if all MPI ranks are stable.
bool AllMPIRanksAreStable(bool current_rank_stability);
/// Merge individual perf status reports into a single perf status. This
/// function is used to merge the results from multiple Measure runs into a
/// single report.
/// \param perf_status List of perf status reports to be merged.
/// \param summary_status Final merged summary status.
/// \return cb::Error object indicating success or failure.
virtual cb::Error MergePerfStatusReports(
std::deque<PerfStatus>& perf_status, PerfStatus& summary_status);
/// Merge individual server side statistics into a single server side report.
/// \param server_side_stats List of server side statistics reports to be
/// merged.
/// \param server_side_summary Final merged summary status.
/// \return cb::Error object indicating success or failure.
virtual cb::Error MergeServerSideStats(
std::vector<ServerSideStats>& server_side_stats,
ServerSideStats& server_side_summary);
/// \param all_metrics Individual metrics from all intervals from stable
/// passes.
/// \param merged_metrics Output merged metrics from all intervals from stable
/// passes.
/// \return cb::Error object indicating success or failure.
cb::Error MergeMetrics(
const std::vector<std::reference_wrapper<const Metrics>>& all_metrics,
Metrics& merged_metrics);
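/// Illustrative example for the per-GPU averaging helper below (hypothetical
/// GPU keys and values): averaging {"GPU-A": 60.0, "GPU-B": 40.0} and
/// {"GPU-A": 80.0} yields {"GPU-A": 70.0, "GPU-B": 40.0}, since each GPU is
/// averaged only over the intervals in which it was reported.
/// \code
///   std::map<std::string, double> m1{{"GPU-A", 60.0}, {"GPU-B", 40.0}};
///   std::map<std::string, double> m2{{"GPU-A", 80.0}};
///   std::map<std::string, double> averaged;
///   GetMetricAveragePerGPU<double>({std::cref(m1), std::cref(m2)}, averaged);
///   // averaged now holds {"GPU-A": 70.0, "GPU-B": 40.0}
/// \endcode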
template <typename T>
void GetMetricAveragePerGPU(
const std::vector<std::reference_wrapper<const std::map<std::string, T>>>&
input_metric_maps,
std::map<std::string, T>& output_metric_map)
{
std::map<std::string, size_t> metric_count_per_gpu{};
for (const auto& input_metric_map : input_metric_maps) {
for (const auto& input_metric : input_metric_map.get()) {
const auto& gpu_uuid{input_metric.first};
const auto& metric{input_metric.second};
if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
output_metric_map[gpu_uuid] = 0;
metric_count_per_gpu[gpu_uuid] = 0;
}
output_metric_map[gpu_uuid] += metric;
metric_count_per_gpu[gpu_uuid]++;
}
}
for (auto& output_metric : output_metric_map) {
const auto& gpu_uuid{output_metric.first};
auto& metric{output_metric.second};
const auto& metric_count{metric_count_per_gpu[gpu_uuid]};
if (metric_count > 0) {
metric /= metric_count;
}
}
}
template <typename T>
void GetMetricMaxPerGPU(
const std::vector<std::reference_wrapper<const std::map<std::string, T>>>&
input_metric_maps,
std::map<std::string, T>& output_metric_map)
{
for (const auto& input_metric_map : input_metric_maps) {
for (const auto& input_metric : input_metric_map.get()) {
const auto& gpu_uuid{input_metric.first};
const auto& metric{input_metric.second};
if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
output_metric_map[gpu_uuid] = 0;
}
output_metric_map[gpu_uuid] =
std::max(output_metric_map[gpu_uuid], metric);
}
}
}
template <typename T>
void GetMetricFirstPerGPU(
const std::vector<std::reference_wrapper<const std::map<std::string, T>>>&
input_metric_maps,
std::map<std::string, T>& output_metric_map)
{
for (const auto& input_metric_map : input_metric_maps) {
for (const auto& input_metric : input_metric_map.get()) {
const auto& gpu_uuid{input_metric.first};
const auto& metric{input_metric.second};
if (output_metric_map.find(gpu_uuid) == output_metric_map.end()) {
output_metric_map[gpu_uuid] = metric;
}
}
}
}
bool verbose_;
uint64_t measurement_window_ms_;
uint64_t measurement_request_count_;
MeasurementMode measurement_mode_;
size_t max_trials_;
bool extra_percentile_;
size_t percentile_;
uint64_t latency_threshold_ms_;
cb::ProtocolType protocol_;
std::string model_name_;
int64_t model_version_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackend> profile_backend_;
std::unique_ptr<LoadManager> manager_;
std::shared_ptr<ProfileDataCollector> collector_;
LoadParams load_parameters_;
bool include_lib_stats_;
bool include_server_stats_;
std::shared_ptr<MPIDriver> mpi_driver_;
/// The request records of the requests completed during all measurements
std::vector<RequestRecord> all_request_records_;
/// The end time of the previous measurement window
uint64_t previous_window_end_ns_;
/// Server side statistics from the previous measurement window
std::map<cb::ModelIdentifier, cb::ModelStatistics> prev_server_side_stats_;
/// Client side statistics from the previous measurement window
cb::InferStat prev_client_side_stats_;
/// Metrics manager that collects server-side metrics periodically
std::shared_ptr<MetricsManager> metrics_manager_{nullptr};
/// Whether server-side inference server metrics should be collected.
bool should_collect_metrics_{false};
/// User set threshold above which the PA overhead is too significant to
/// provide usable results.
const double overhead_pct_threshold_{0.0};
/// Whether to collect profile data.
bool should_collect_profile_data_{false};
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockInferenceProfiler;
friend TestInferenceProfiler;
public:
InferenceProfiler() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "rate_schedule.h"
namespace triton { namespace perfanalyzer {
/// Interface for worker threads that use a schedule
///
class IScheduler {
public:
/// Provides the schedule that should be followed
///
virtual void SetSchedule(RateSchedulePtr_t schedule) = 0;
};
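/// A minimal illustrative implementation (hypothetical, for documentation
/// only):
/// \code
///   class FixedScheduleWorker : public IScheduler {
///    public:
///     void SetSchedule(RateSchedulePtr_t schedule) override
///     {
///       schedule_ = schedule;
///     }
///
///    private:
///     RateSchedulePtr_t schedule_;
///   };
/// \endcode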
}} // namespace triton::perfanalyzer
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace perfanalyzer {
/// Interface for worker threads that generate inference requests
///
class IWorker {
public:
virtual void Infer() = 0;
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "load_manager.h"
#include <algorithm>
#include "client_backend/client_backend.h"
#include "infer_data_manager_factory.h"
namespace triton { namespace perfanalyzer {
cb::Error
LoadManager::CheckHealth()
{
// Check thread status to make sure that the load setting is
// consistent with the one being reported.
// If some thread returns early, the main thread will return and
// the worker thread's error message will be reported
// when the derived class destructor gets called.
for (auto& thread_stat : threads_stat_) {
if (!thread_stat->status_.IsOk()) {
return cb::Error(
"Failed to maintain requested inference load."
" Worker thread(s) failed to generate concurrent requests.",
pa::GENERIC_ERROR);
}
if (!thread_stat->cb_status_.IsOk()) {
return cb::Error(
"Failed to retrieve results from inference request.",
pa::GENERIC_ERROR);
}
}
return cb::Error::Success;
}
cb::Error
LoadManager::SwapRequestRecords(std::vector<RequestRecord>& new_request_records)
{
std::vector<RequestRecord> total_request_records;
// Gather request records with proper locking from all the worker threads
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
total_request_records.insert(
total_request_records.end(), thread_stat->request_records_.begin(),
thread_stat->request_records_.end());
thread_stat->request_records_.clear();
}
// Swap the results
total_request_records.swap(new_request_records);
return cb::Error::Success;
}
uint64_t
LoadManager::CountCollectedRequests()
{
uint64_t num_of_requests = 0;
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
num_of_requests += thread_stat->request_records_.size();
}
return num_of_requests;
}
cb::Error
LoadManager::GetAccumulatedClientStat(cb::InferStat* contexts_stat)
{
contexts_stat->completed_request_count = 0;
contexts_stat->cumulative_receive_time_ns = 0;
contexts_stat->cumulative_send_time_ns = 0;
contexts_stat->cumulative_total_request_time_ns = 0;
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
for (auto& context_stat : thread_stat->contexts_stat_) {
contexts_stat->completed_request_count +=
context_stat.completed_request_count;
contexts_stat->cumulative_total_request_time_ns +=
context_stat.cumulative_total_request_time_ns;
contexts_stat->cumulative_send_time_ns +=
context_stat.cumulative_send_time_ns;
contexts_stat->cumulative_receive_time_ns +=
context_stat.cumulative_receive_time_ns;
}
}
return cb::Error::Success;
}
uint64_t
LoadManager::GetIdleTime()
{
uint64_t total{0};
size_t num_active_threads = 0;
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
uint64_t idle_time = thread_stat->idle_timer.GetIdleTime();
if (idle_time) {
total += idle_time;
num_active_threads++;
}
}
// TODO REFACTOR TMA-1043 InferDataManager should have an API to get
// num_active_threads. This method of determining active threads isn't fully
// accurate
if (num_active_threads) {
total /= num_active_threads;
}
return total;
}
void
LoadManager::ResetIdleTime()
{
for (auto& thread_stat : threads_stat_) {
std::lock_guard<std::mutex> lock(thread_stat->mu_);
thread_stat->idle_timer.Reset();
}
}
const size_t
LoadManager::GetAndResetNumSentRequests()
{
size_t num_sent_requests{0};
for (auto& thread_stat : threads_stat_) {
num_sent_requests += thread_stat->num_sent_requests_;
thread_stat->num_sent_requests_ = 0;
}
return num_sent_requests;
}
LoadManager::LoadManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: async_(async), streaming_(streaming), batch_size_(batch_size),
max_threads_(max_threads), parser_(parser), factory_(factory),
using_json_data_(false)
{
on_sequence_model_ =
((parser_->SchedulerType() == ModelParser::SEQUENCE) ||
(parser_->SchedulerType() == ModelParser::ENSEMBLE_SEQUENCE));
data_loader_.reset(new DataLoader(batch_size_));
infer_data_manager_ = InferDataManagerFactory::CreateInferDataManager(
max_threads, batch_size, shared_memory_type, output_shm_size, parser,
factory, data_loader_);
}
void
LoadManager::InitManager(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data,
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation)
{
// Note, this is already caught by the CLI, but adding it here for extra
// protection
if (on_sequence_model_ && batch_size_ > 1) {
throw PerfAnalyzerException(
"error: sequence models do not support batching", GENERIC_ERROR);
}
auto status =
InitManagerInputs(string_length, string_data, zero_input, user_data);
THROW_IF_ERROR(status, "Failed to init manager inputs");
THROW_IF_ERROR(
infer_data_manager_->Init(), "Unable to init infer data manager");
sequence_manager_ = MakeSequenceManager(
start_sequence_id, sequence_id_range, sequence_length,
sequence_length_specified, sequence_length_variation, using_json_data_,
data_loader_);
InitManagerFinalize();
}
cb::Error
LoadManager::InitManagerInputs(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data)
{
RETURN_IF_ERROR(factory_->CreateClientBackend(&backend_));
// Read provided data
if (!user_data.empty()) {
if (IsDirectory(user_data[0])) {
RETURN_IF_ERROR(data_loader_->ReadDataFromDir(
parser_->Inputs(), parser_->Outputs(), user_data[0]));
} else {
using_json_data_ = true;
for (const auto& json_file : user_data) {
RETURN_IF_ERROR(data_loader_->ReadDataFromJSON(
parser_->Inputs(), parser_->Outputs(), json_file));
}
std::cout << " Successfully read data for "
<< data_loader_->GetDataStreamsCount() << " stream/streams";
if (data_loader_->GetDataStreamsCount() == 1) {
std::cout << " with " << data_loader_->GetTotalSteps(0)
<< " step/steps";
}
std::cout << "." << std::endl;
}
} else {
RETURN_IF_ERROR(data_loader_->GenerateData(
parser_->Inputs(), zero_input, string_length, string_data));
}
// Reserve the required vector space
threads_stat_.reserve(max_threads_);
return cb::Error::Success;
}
void
LoadManager::StopWorkerThreads()
{
early_exit = true;
// wake up all threads
wake_signal_.notify_all();
size_t cnt = 0;
for (auto& thread : threads_) {
thread.join();
if (!threads_stat_[cnt]->status_.IsOk()) {
std::cerr << "Thread [" << cnt
<< "] had error: " << (threads_stat_[cnt]->status_)
<< std::endl;
}
if (!threads_stat_[cnt]->cb_status_.IsOk()) {
std::cerr << "Thread [" << cnt
<< "] had error: " << (threads_stat_[cnt]->cb_status_)
<< std::endl;
}
cnt++;
}
threads_.clear();
}
std::shared_ptr<SequenceManager>
LoadManager::MakeSequenceManager(
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation, const bool using_json_data,
std::shared_ptr<DataLoader> data_loader)
{
return std::make_shared<SequenceManager>(
start_sequence_id, sequence_id_range, sequence_length,
sequence_length_specified, sequence_length_variation, using_json_data,
data_loader);
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <condition_variable>
#include <memory>
#include <random>
#include <thread>
#include "client_backend/client_backend.h"
#include "data_loader.h"
#include "iinfer_data_manager.h"
#include "load_worker.h"
#include "perf_utils.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockLoadManager;
#endif
class LoadManager {
public:
virtual ~LoadManager() = default;
/// Initialize the Manager class to set up shared memory and inputs
/// \param string_length The length of the random strings to be generated
/// for string inputs.
/// \param string_data The string to be used as string inputs for the model.
/// \param zero_input Whether to use zero for model inputs.
/// \param user_data The vector containing path/paths to user-provided data
/// that can be a directory or path to a json data file.
/// \param start_sequence_id The starting sequence ID to be used for iterating
/// through valid sequence IDs.
/// \param sequence_id_range The maximum sequence ID to be used for iterating
/// through valid sequence IDs.
/// \param sequence_length The base length of new sequences.
/// \param sequence_length_specified Whether the user specified the sequence
/// length.
/// \param sequence_length_variation The percentage variation in length of
/// sequences using autogenerated data as input.
void InitManager(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data,
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation);
/// Check if the load manager is working as expected.
/// \return cb::Error object indicating success or failure.
cb::Error CheckHealth();
/// Swap the content of the request records vector recorded by the load
/// manager with a new request records vector
/// \param new_request_records The request records vector to be swapped.
/// \return cb::Error object indicating success or failure.
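///
/// A caller typically passes in an empty vector and receives the records
/// gathered from all worker threads (whose own record vectors are cleared),
/// e.g. (illustrative only):
/// \code
///   std::vector<RequestRecord> records;
///   manager->SwapRequestRecords(records);
///   // 'records' now holds the request records collected so far.
/// \endcode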
cb::Error SwapRequestRecords(std::vector<RequestRecord>& new_request_records);
/// Get the sum of all contexts' stats.
/// \param contexts_stat Returns the accumulated stats from all contexts
/// in the load manager.
/// \return cb::Error object indicating success or failure.
cb::Error GetAccumulatedClientStat(cb::InferStat* contexts_stat);
/// Returns the average idle time per worker thread in nanoseconds
///
uint64_t GetIdleTime();
/// Resets the counters used for tracking idle time
///
void ResetIdleTime();
/// Calculates and returns the total number of sent requests across all
/// threads. Resets individual number of sent requests per thread.
/// \return The total number of sent requests across all threads.
const size_t GetAndResetNumSentRequests();
/// \return the batch size used for the inference requests
virtual size_t BatchSize() const { return batch_size_; }
/// Count the number of requests collected until now.
uint64_t CountCollectedRequests();
protected:
LoadManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const SharedMemoryType shared_memory_type,
const size_t output_shm_size, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);
/// Complete any subclass-specific manager initialization tasks.
virtual void InitManagerFinalize() {}
/// Helper function to retrieve the input data for the inferences
/// \param string_length The length of the random strings to be generated
/// for string inputs.
/// \param string_data The string to be used as string inputs for the model.
/// \param zero_input Whether to use zero for model inputs.
/// \param user_data The vector containing path/paths to user-provided data
/// that can be a directory or path to a json data file.
/// \return cb::Error object indicating success or failure.
cb::Error InitManagerInputs(
const size_t string_length, const std::string& string_data,
const bool zero_input, std::vector<std::string>& user_data);
/// Stops all the worker threads generating the request load.
void StopWorkerThreads();
protected:
bool async_;
bool streaming_;
size_t batch_size_;
size_t max_threads_;
bool on_sequence_model_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackendFactory> factory_;
bool using_json_data_;
std::shared_ptr<DataLoader> data_loader_;
std::unique_ptr<cb::ClientBackend> backend_;
std::shared_ptr<IInferDataManager> infer_data_manager_;
// Track the workers so they all go out of scope at the
// same time
std::vector<std::shared_ptr<IWorker>> workers_;
// Worker threads that load the server with inferences
std::vector<std::thread> threads_;
// Contains the statistics on the current working threads
std::vector<std::shared_ptr<ThreadStat>> threads_stat_;
// Use condition variable to pause/continue worker threads
std::condition_variable wake_signal_;
std::mutex wake_mutex_;
std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
virtual std::shared_ptr<SequenceManager> MakeSequenceManager(
const uint64_t start_sequence_id, const uint64_t sequence_id_range,
const size_t sequence_length, const bool sequence_length_specified,
const double sequence_length_variation, const bool using_json_data,
std::shared_ptr<DataLoader> data_loader);
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockLoadManager;
public:
LoadManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "load_worker.h"
#include <algorithm>
#include <thread>
#include "client_backend/client_backend.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
bool
LoadWorker::ShouldExit()
{
return early_exit || !thread_stat_->cb_status_.IsOk() ||
!thread_stat_->status_.IsOk();
}
bool
LoadWorker::HandleExitConditions()
{
if (ShouldExit()) {
CompleteOngoingSequences();
WaitForOngoingRequests();
return true;
}
return false;
}
void
LoadWorker::CompleteOngoingSequences()
{
if (on_sequence_model_) {
for (size_t ctx_id = 0; ctx_id < ctxs_.size(); ++ctx_id) {
size_t seq_stat_index = GetSeqStatIndex(ctx_id);
ctxs_[ctx_id]->CompleteOngoingSequence(seq_stat_index);
}
}
}
void
LoadWorker::WaitForOngoingRequests()
{
while (GetNumOngoingRequests() != 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(50));
}
}
uint
LoadWorker::GetNumOngoingRequests()
{
uint num = 0;
for (auto ctx : ctxs_) {
num += ctx->GetNumOngoingRequests();
}
return num;
}
void
LoadWorker::CreateContext()
{
auto ctx = CreateInferContext();
ctx->Init();
CreateContextFinalize(ctx);
ctxs_.push_back(ctx);
}
uint32_t
LoadWorker::GetCtxId()
{
std::lock_guard<std::mutex> lk(cb_mtx_);
return ctx_id_tracker_->Get();
}
void
LoadWorker::RestoreFreeCtxId(uint32_t ctx_id)
{
if (!async_) {
{
std::lock_guard<std::mutex> lock(cb_mtx_);
ctx_id_tracker_->Restore(ctx_id);
}
}
}
void
LoadWorker::AsyncCallbackFinalize(uint32_t ctx_id)
{
// avoid competition over 'cb_mtx_'
{
std::lock_guard<std::mutex> lk(cb_mtx_);
ctx_id_tracker_->Restore(ctx_id);
notified_ = true;
}
cb_cv_.notify_all();
}
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>
#include "ctx_id_tracker_factory.h"
#include "data_loader.h"
#include "infer_context.h"
#include "iworker.h"
#include "model_parser.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
/// Abstract base class for worker threads
///
class LoadWorker : public IWorker {
protected:
LoadWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
const std::shared_ptr<ModelParser> parser,
std::shared_ptr<DataLoader> data_loader,
const std::shared_ptr<cb::ClientBackendFactory> factory,
const bool on_sequence_model, const bool async, const bool streaming,
const int32_t batch_size, const bool using_json_data,
std::condition_variable& wake_signal, std::mutex& wake_mutex,
bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: id_(id), thread_stat_(thread_stat), parser_(parser),
data_loader_(data_loader), factory_(factory),
on_sequence_model_(on_sequence_model), async_(async),
streaming_(streaming), batch_size_(batch_size),
using_json_data_(using_json_data), wake_signal_(wake_signal),
wake_mutex_(wake_mutex), execute_(execute),
infer_data_manager_(infer_data_manager),
sequence_manager_(sequence_manager)
{
}
virtual ~LoadWorker() = default;
protected:
// Return the total number of async requests that have started and not
// finished
uint GetNumOngoingRequests();
void SendInferRequest(uint32_t ctx_id, bool delayed = false)
{
if (ShouldExit()) {
return;
}
if (on_sequence_model_) {
uint32_t seq_stat_index = GetSeqStatIndex(ctx_id);
ctxs_[ctx_id]->SendSequenceInferRequest(seq_stat_index, delayed);
} else {
ctxs_[ctx_id]->SendInferRequest(delayed);
}
}
virtual std::shared_ptr<InferContext> CreateInferContext()
{
return std::make_shared<InferContext>(
id_, ctxs_.size(), async_, streaming_, on_sequence_model_,
using_json_data_, batch_size_, thread_stat_, data_loader_, parser_,
factory_, execute_, infer_data_manager_, sequence_manager_);
}
// Create an inference context and add it to ctxs_
virtual void CreateContext();
// Any code that needs to execute after the Context has been created
virtual void CreateContextFinalize(std::shared_ptr<InferContext> ctx) = 0;
// Detect the cases where this thread needs to exit
bool ShouldExit();
// Detect and handle the case where this thread needs to exit
// Returns true if an exit condition was met
bool HandleExitConditions();
void CompleteOngoingSequences();
void WaitForOngoingRequests();
virtual uint32_t GetSeqStatIndex(uint32_t ctx_id) = 0;
uint32_t GetCtxId();
void RestoreFreeCtxId(uint32_t ctx_id);
void AsyncCallbackFinalize(uint32_t ctx_id);
uint32_t id_;
std::vector<std::shared_ptr<InferContext>> ctxs_;
std::shared_ptr<ICtxIdTracker> ctx_id_tracker_;
// Variables used to signal async request completion
bool notified_ = false;
std::mutex cb_mtx_;
std::condition_variable cb_cv_;
// TODO REFACTOR TMA-1017 is there a better way to do threading than to pass
// the same cv/mutex into every thread by reference? Used to wake up this
// thread if it has been put to sleep
std::condition_variable& wake_signal_;
std::mutex& wake_mutex_;
// TODO REFACTOR TMA-1017 is there a better way to communicate this than a
// shared bool reference? Used to pause execution of this thread
bool& execute_;
// Stats for this thread
std::shared_ptr<ThreadStat> thread_stat_;
std::shared_ptr<DataLoader> data_loader_;
const std::shared_ptr<ModelParser> parser_;
const std::shared_ptr<cb::ClientBackendFactory> factory_;
const std::shared_ptr<IInferDataManager> infer_data_manager_;
const bool on_sequence_model_;
const bool async_;
const bool streaming_;
const int32_t batch_size_;
const bool using_json_data_;
std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "perf_analyzer.h"
#include "perf_analyzer_exception.h"
namespace pa = triton::perfanalyzer;
int
main(int argc, char* argv[])
{
try {
triton::perfanalyzer::CLParser clp;
pa::PAParamsPtr params = clp.Parse(argc, argv);
PerfAnalyzer analyzer(params);
analyzer.Run();
}
catch (pa::PerfAnalyzerException& e) {
return e.GetError();
}
return 0;
}
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <cstdint>
#include <map>
#include <string>
namespace triton { namespace perfanalyzer {
/// Struct that holds server-side metrics for the inference server.
/// The keys for each map are GPU UUIDs and the values are described in the
/// variable names.
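///
/// Illustrative contents (hypothetical UUID and values): after one scrape of
/// the metrics endpoint an instance might hold
/// \code
///   Metrics m;
///   m.gpu_utilization_per_gpu["GPU-aaaa"] = 0.75;
///   m.gpu_power_usage_per_gpu["GPU-aaaa"] = 150.0;
///   m.gpu_memory_used_bytes_per_gpu["GPU-aaaa"] = 2000000000;
///   m.gpu_memory_total_bytes_per_gpu["GPU-aaaa"] = 16000000000;
/// \endcode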
struct Metrics {
std::map<std::string, double> gpu_utilization_per_gpu{};
std::map<std::string, double> gpu_power_usage_per_gpu{};
std::map<std::string, uint64_t> gpu_memory_used_bytes_per_gpu{};
std::map<std::string, uint64_t> gpu_memory_total_bytes_per_gpu{};
};
}} // namespace triton::perfanalyzer
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "metrics_manager.h"
#include <iostream>
#include <stdexcept>
#include <utility>
#include "constants.h"
#include "perf_analyzer_exception.h"
namespace triton { namespace perfanalyzer {
MetricsManager::MetricsManager(
std::shared_ptr<clientbackend::ClientBackend> client_backend,
uint64_t metrics_interval_ms)
: client_backend_(client_backend), metrics_interval_ms_(metrics_interval_ms)
{
}
MetricsManager::~MetricsManager()
{
if (query_loop_future_.valid()) {
StopQueryingMetrics();
}
}
void
MetricsManager::StartQueryingMetrics()
{
should_keep_querying_ = true;
query_loop_future_ =
std::async(&MetricsManager::QueryMetricsEveryNMilliseconds, this);
}
void
MetricsManager::QueryMetricsEveryNMilliseconds()
{
while (should_keep_querying_) {
const auto& start{std::chrono::system_clock::now()};
Metrics metrics{};
clientbackend::Error err{client_backend_->Metrics(metrics)};
if (err.IsOk() == false) {
throw PerfAnalyzerException(err.Message(), err.Err());
}
CheckForMissingMetrics(metrics);
{
std::lock_guard<std::mutex> metrics_lock{metrics_mutex_};
metrics_.push_back(std::move(metrics));
}
const auto& end{std::chrono::system_clock::now()};
const auto& duration{end - start};
const auto& remainder{
std::chrono::milliseconds(metrics_interval_ms_) - duration};
CheckForMetricIntervalTooShort(remainder, duration);
{
std::unique_lock<std::mutex> query_loop_lock{query_loop_mutex_};
query_loop_cv_.wait_for(query_loop_lock, remainder);
}
}
}
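// Note on the loop above (hypothetical numbers): with metrics_interval_ms_ ==
// 1000 and a query that takes 200ms, the remainder is ~800ms, so the loop
// waits that long before the next scrape; a negative remainder triggers the
// interval-too-short warning emitted by CheckForMetricIntervalTooShort().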
void
MetricsManager::CheckForMissingMetrics(const Metrics& metrics)
{
if (has_given_missing_metrics_warning_) {
return;
}
if (metrics.gpu_utilization_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_utilization' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
if (metrics.gpu_power_usage_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_power_usage' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
if (metrics.gpu_memory_used_bytes_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_memory_used_bytes' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
if (metrics.gpu_memory_total_bytes_per_gpu.empty()) {
std::cerr << "WARNING: Unable to parse 'nv_gpu_memory_total_bytes' metric."
<< std::endl;
has_given_missing_metrics_warning_ = true;
}
}
void
MetricsManager::CheckForMetricIntervalTooShort(
const std::chrono::nanoseconds& remainder,
const std::chrono::nanoseconds& duration)
{
if (has_given_metric_interval_warning_) {
return;
}
if (remainder < std::chrono::nanoseconds::zero()) {
std::cerr << "WARNING: Triton metrics endpoint latency ("
<< std::chrono::duration_cast<std::chrono::milliseconds>(duration)
.count()
<< "ms) is larger than the querying interval ("
<< metrics_interval_ms_
<< "ms). Please try a larger querying interval "
"via `--triton-metrics-interval`."
<< std::endl;
has_given_metric_interval_warning_ = true;
}
}
void
MetricsManager::CheckQueryingStatus()
{
if (query_loop_future_.valid() &&
query_loop_future_.wait_for(std::chrono::seconds(0)) ==
std::future_status::ready) {
query_loop_future_.get();
}
}
void
MetricsManager::GetLatestMetrics(std::vector<Metrics>& metrics)
{
if (metrics.empty() == false) {
throw PerfAnalyzerException(
"MetricsManager::GetLatestMetrics() must be passed an empty vector.",
GENERIC_ERROR);
}
std::lock_guard<std::mutex> metrics_lock{metrics_mutex_};
metrics_.swap(metrics);
}
void
MetricsManager::StopQueryingMetrics()
{
should_keep_querying_ = false;
query_loop_cv_.notify_one();
if (query_loop_future_.valid()) {
query_loop_future_.get();
}
}
}} // namespace triton::perfanalyzer
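The query loop above combines std::async, a condition variable, and a wait_for() on the remainder of the interval so that StopQueryingMetrics() can interrupt the sleep immediately. The stripped-down, standalone sketch below illustrates the same pattern outside of Perf Analyzer; the PeriodicWorker and DoWork names are purely illustrative and not part of this codebase.

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <future>
#include <iostream>
#include <mutex>

// Minimal stand-in for MetricsManager's query loop: run DoWork() roughly
// every `interval`, sleeping only for whatever time is left after the work,
// and wake up early when Stop() is called.
class PeriodicWorker {
 public:
  explicit PeriodicWorker(std::chrono::milliseconds interval)
      : interval_(interval)
  {
  }

  void Start()
  {
    keep_running_ = true;
    loop_future_ = std::async(std::launch::async, &PeriodicWorker::Loop, this);
  }

  void Stop()
  {
    keep_running_ = false;
    cv_.notify_one();  // interrupt a pending wait_for() immediately
    if (loop_future_.valid()) {
      loop_future_.get();  // rethrows any exception thrown inside Loop()
    }
  }

 private:
  void Loop()
  {
    while (keep_running_) {
      const auto start{std::chrono::steady_clock::now()};
      DoWork();
      const auto elapsed{std::chrono::steady_clock::now() - start};
      std::unique_lock<std::mutex> lock{mutex_};
      // If the work took longer than the interval, the remainder is negative
      // and wait_for() returns immediately.
      cv_.wait_for(lock, interval_ - elapsed);
    }
  }

  void DoWork() { std::cout << "querying metrics..." << std::endl; }

  std::chrono::milliseconds interval_;
  std::atomic<bool> keep_running_{false};
  std::future<void> loop_future_{};
  std::mutex mutex_{};
  std::condition_variable cv_{};
};

Specifying std::launch::async guarantees the loop runs on its own thread rather than being deferred until get() is called, which is the behavior the MetricsManager documentation describes.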
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <future>
#include <memory>
#include <mutex>
#include <vector>
#include "client_backend/client_backend.h"
#include "metrics.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestMetricsManager;
#endif
class MetricsManager {
public:
MetricsManager(
std::shared_ptr<clientbackend::ClientBackend> client_backend,
uint64_t metrics_interval_ms);
/// Ends the background thread as a safety net in case StopQueryingMetrics()
/// was not called explicitly
~MetricsManager();
/// Starts a background thread that queries metrics at a fixed interval
void StartQueryingMetrics();
/// Checks whether the background thread threw an exception and rethrows it on
/// the calling thread if so
void CheckQueryingStatus();
/// Moves the metrics collected so far by the background thread into the
/// output vector for use by the main thread
void GetLatestMetrics(std::vector<Metrics>& metrics_per_timestamp);
/// Ends the background thread
void StopQueryingMetrics();
private:
void QueryMetricsEveryNMilliseconds();
void CheckForMissingMetrics(const Metrics& metrics);
void CheckForMetricIntervalTooShort(
const std::chrono::nanoseconds& remainder,
const std::chrono::nanoseconds& duration);
std::shared_ptr<clientbackend::ClientBackend> client_backend_{nullptr};
uint64_t metrics_interval_ms_{0};
std::mutex metrics_mutex_{};
std::vector<Metrics> metrics_{};
bool should_keep_querying_{false};
std::future<void> query_loop_future_{};
std::mutex query_loop_mutex_{};
std::condition_variable query_loop_cv_{};
bool has_given_missing_metrics_warning_{false};
bool has_given_metric_interval_warning_{false};
#ifndef DOCTEST_CONFIG_DISABLE
friend TestMetricsManager;
public:
MetricsManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
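A minimal sketch of how a caller might drive this interface, assuming a concrete clientbackend::ClientBackend named `backend` is already constructed and the surrounding load-generation harness is omitted; the RunWithMetrics function and the 1000 ms interval are illustrative choices, not part of the original code.

#include <chrono>
#include <memory>
#include <thread>
#include <vector>

#include "metrics.h"
#include "metrics_manager.h"

namespace pa = triton::perfanalyzer;

void
RunWithMetrics(std::shared_ptr<pa::clientbackend::ClientBackend> backend)
{
  // Poll the Triton metrics endpoint every 1000 ms on a background thread.
  pa::MetricsManager manager(backend, 1000 /* metrics_interval_ms */);
  manager.StartQueryingMetrics();

  for (int i = 0; i < 5; i++) {
    std::this_thread::sleep_for(std::chrono::seconds(1));
    // Rethrows on this thread if the background query loop failed.
    manager.CheckQueryingStatus();
  }

  // GetLatestMetrics() requires an empty vector and swaps the collected
  // metrics into it.
  std::vector<pa::Metrics> collected;
  manager.GetLatestMetrics(collected);

  // The destructor would also stop the loop, but stopping explicitly is
  // clearer.
  manager.StopQueryingMetrics();
}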
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "concurrency_worker.h"
#include "gmock/gmock.h"
namespace triton { namespace perfanalyzer {
class NaggyMockConcurrencyWorker : public ConcurrencyWorker {
public:
NaggyMockConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
const std::shared_ptr<ModelParser> parser,
std::shared_ptr<DataLoader> data_loader,
const std::shared_ptr<cb::ClientBackendFactory> factory,
const bool on_sequence_model, const bool async,
const size_t max_concurrency, const bool using_json_data,
const bool streaming, const int32_t batch_size,
std::condition_variable& wake_signal, std::mutex& wake_mutex,
size_t& active_threads, bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: ConcurrencyWorker(
id, thread_stat, thread_config, parser, data_loader, factory,
on_sequence_model, async, max_concurrency, using_json_data,
streaming, batch_size, wake_signal, wake_mutex, active_threads,
execute, infer_data_manager, sequence_manager)
{
ON_CALL(*this, Infer()).WillByDefault([this]() -> void {
ConcurrencyWorker::Infer();
});
}
MOCK_METHOD(void, Infer, (), (override));
void EmptyInfer() { thread_config_->is_paused_ = true; }
};
// "Nice" (non-naggy) version of the mock: suppresses gmock's "uninteresting
// mock function call" warnings when a mocked method is invoked without a
// matching EXPECT_CALL
using MockConcurrencyWorker = testing::NiceMock<NaggyMockConcurrencyWorker>;
}} // namespace triton::perfanalyzer
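The ON_CALL(...).WillByDefault(...) in the constructor above makes the mock fall through to the real ConcurrencyWorker::Infer() unless a test overrides it. The sketch below shows the same naggy/nice mock pattern on a deliberately simplified, hypothetical Worker base class, since constructing a real ConcurrencyWorker requires the full set of perf_analyzer dependencies.

#include "gmock/gmock.h"
#include "gtest/gtest.h"

// Hypothetical base class standing in for ConcurrencyWorker.
class Worker {
 public:
  virtual ~Worker() = default;
  virtual void Infer() { /* real inference loop would run here */ }
};

class NaggyMockWorker : public Worker {
 public:
  NaggyMockWorker()
  {
    // Default action delegates to the real implementation, mirroring
    // NaggyMockConcurrencyWorker above.
    ON_CALL(*this, Infer()).WillByDefault([this]() { Worker::Infer(); });
  }
  MOCK_METHOD(void, Infer, (), (override));
};

// NiceMock suppresses warnings for calls with no matching EXPECT_CALL, which
// is why the alias above wraps the naggy mock in testing::NiceMock.
using MockWorker = testing::NiceMock<NaggyMockWorker>;

TEST(MockWorkerExample, InferDelegatesToRealImplementationByDefault)
{
  MockWorker worker;
  EXPECT_CALL(worker, Infer()).Times(1);
  worker.Infer();  // runs Worker::Infer() via the ON_CALL default action
}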