// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <string>
#include <vector>
#include "client_backend/client_backend.h"
#include "request_rate_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestCustomLoadManager;
#endif
//==============================================================================
/// CustomLoadManager is a helper class to send inference requests to
/// inference server in accordance with user provided time intervals. This
/// load manager can be used to model certain patterns of interest.
///
class CustomLoadManager : public RequestRateManager {
public:
~CustomLoadManager() = default;
/// Create an object of realistic load manager that is responsible for
/// maintaining the specified load on the inference server.
/// \param async Whether to use asynchronous or synchronous API for infer
/// request.
/// \param streaming Whether to use gRPC streaming API for infer request
/// \param measurement_window_ms The time window for measurements.
/// \param max_trials The maximum number of windows that will be measured
/// \param request_intervals_file The path to the file to use to pick up the
/// time intervals between the successive requests.
/// \param batch_size The batch size used for each request.
/// \param max_threads The maximum number of working threads to be spawned.
/// \param num_of_sequences The number of concurrent sequences that must be
/// maintained on the server.
/// \param zero_input Whether to fill the input tensors with zero.
/// \param input_shapes The shape of the input tensors.
/// \param user_data The vector containing path/paths to user-provided data
/// that can be a directory or path to a json data file.
/// \param shared_memory_type The type of shared memory to use for inputs.
/// \param output_shm_size The size of the shared memory to allocate for the
/// output.
/// \param serial_sequences Enable serial sequence mode.
/// \param parser The ModelParser object to get the model details.
/// \param factory The ClientBackendFactory object used to create
/// client to the server.
/// \param manager Returns a new CustomLoadManager object.
/// \return cb::Error object indicating success or failure.
static cb::Error Create(
const bool async, const bool streaming,
const uint64_t measurement_window_ms, const size_t max_trials,
const std::string& request_intervals_file, const int32_t batch_size,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager);
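// Illustrative usage sketch only. The option values below and the 'parser' /
// 'factory' objects are placeholders assumed to be built by the caller; they
// are not defined in this header.
//
//   std::unique_ptr<LoadManager> manager;
//   cb::Error err = CustomLoadManager::Create(
//       /*async=*/true, /*streaming=*/false,
//       /*measurement_window_ms=*/5000, /*max_trials=*/10,
//       "custom_intervals.txt", /*batch_size=*/1,
//       /*max_threads=*/16, /*num_of_sequences=*/4,
//       /*shared_memory_type=*/NO_SHARED_MEMORY, /*output_shm_size=*/0,
//       /*serial_sequences=*/false, parser, factory, &manager);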
/// Initializes the load manager with the provided file containing request
/// intervals
/// \return cb::Error object indicating success or failure.
cb::Error InitCustomIntervals();
/// Computes the request rate from the time interval file. Fails with an error
/// if the file is not present or is empty.
/// \param request_rate Returns request rate as computed from the time
/// interval file.
/// \return cb::Error object indicating success or failure.
cb::Error GetCustomRequestRate(double* request_rate);
private:
CustomLoadManager(
const bool async, const bool streaming,
const std::string& request_intervals_file, const int32_t batch_size,
const uint64_t measurement_window_ms, const size_t max_trials,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);
cb::Error GenerateSchedule();
std::vector<RateSchedulePtr_t> CreateWorkerSchedules();
/// Reads the time intervals file and stores intervals in vector
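/// (one interval per line, in microseconds, as accepted by the
/// --request-intervals CLI option)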
/// \param path Filesystem path of the time intervals file.
/// \param contents Output intervals vector.
/// \return cb::Error object indicating success or failure.
virtual cb::Error ReadTimeIntervalsFile(
const std::string& path, NanoIntervals* contents);
std::string request_intervals_file_;
NanoIntervals custom_intervals_;
#ifndef DOCTEST_CONFIG_DISABLE
friend TestCustomLoadManager;
public:
CustomLoadManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "data_loader.h"
#include <b64/decode.h>
#include <rapidjson/filereadstream.h>
#include <fstream>
namespace triton { namespace perfanalyzer {
DataLoader::DataLoader(const size_t batch_size)
: batch_size_(batch_size), data_stream_cnt_(0)
{
}
cb::Error
DataLoader::ReadDataFromDir(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& data_directory)
{
// Directory structure supports only a single data stream and step
data_stream_cnt_ = 1;
step_num_.push_back(1);
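// Tensor data read here is keyed by "<tensor name>_<stream id>_<step id>";
// the directory layout always maps to stream 0, step 0.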
for (const auto& input : *inputs) {
if (input.second.datatype_.compare("BYTES") != 0) {
const auto file_path = data_directory + "/" + input.second.name_;
std::string key_name(
input.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = input_data_.emplace(key_name, std::vector<char>()).first;
RETURN_IF_ERROR(ReadFile(file_path, &it->second));
int64_t byte_size = ByteSize(input.second.shape_, input.second.datatype_);
if (byte_size < 0) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
if (it->second.size() != byte_size) {
return cb::Error(
"provided data for input " + input.second.name_ +
" has byte size " + std::to_string(it->second.size()) +
", expect " + std::to_string(byte_size),
pa::GENERIC_ERROR);
}
} else {
const auto file_path = data_directory + "/" + input.second.name_;
std::vector<std::string> input_string_data;
RETURN_IF_ERROR(ReadTextFile(file_path, &input_string_data));
std::string key_name(
input.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = input_data_.emplace(key_name, std::vector<char>()).first;
SerializeStringTensor(input_string_data, &it->second);
int64_t batch1_num_strings = ElementCount(input.second.shape_);
if (batch1_num_strings == -1) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
if (input_string_data.size() != batch1_num_strings) {
return cb::Error(
"provided data for input " + input.second.name_ + " has " +
std::to_string(input_string_data.size()) +
" elements, expect " + std::to_string(batch1_num_strings),
pa::GENERIC_ERROR);
}
}
}
for (const auto& output : *outputs) {
if (output.second.datatype_.compare("BYTES") != 0) {
const auto file_path = data_directory + "/" + output.second.name_;
std::string key_name(
output.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = output_data_.emplace(key_name, std::vector<char>()).first;
if (!ReadFile(file_path, &it->second).IsOk()) {
output_data_.erase(it);
}
} else {
const auto file_path = data_directory + "/" + output.second.name_;
std::vector<std::string> output_string_data;
if (!ReadTextFile(file_path, &output_string_data).IsOk()) {
continue;
}
std::string key_name(
output.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = output_data_.emplace(key_name, std::vector<char>()).first;
SerializeStringTensor(output_string_data, &it->second);
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadDataFromJSON(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& json_file)
{
FILE* data_file = fopen(json_file.c_str(), "r");
if (data_file == nullptr) {
return cb::Error(
"failed to open file for reading provided data", pa::GENERIC_ERROR);
}
char readBuffer[65536];
rapidjson::FileReadStream fs(data_file, readBuffer, sizeof(readBuffer));
rapidjson::Document d{};
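// Allow NaN and Inf literals when parsing the user-provided JSON data.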
const unsigned int parseFlags = rapidjson::kParseNanAndInfFlag;
d.ParseStream<parseFlags>(fs);
fclose(data_file);
return ParseData(d, inputs, outputs);
}
cb::Error
DataLoader::ParseData(
const rapidjson::Document& json,
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs)
{
if (json.HasParseError()) {
std::cerr << "cb::Error : " << json.GetParseError() << '\n'
<< "Offset : " << json.GetErrorOffset() << '\n';
return cb::Error(
"failed to parse the specified json file for reading provided data",
pa::GENERIC_ERROR);
}
if (!json.HasMember("data")) {
return cb::Error(
"The json file doesn't contain data field", pa::GENERIC_ERROR);
}
const rapidjson::Value& streams = json["data"];
// Validation data is optional, once provided, it must align with 'data'
const rapidjson::Value* out_streams = nullptr;
if (json.HasMember("validation_data")) {
out_streams = &json["validation_data"];
if (out_streams->Size() != streams.Size()) {
return cb::Error(
"The 'validation_data' field doesn't align with 'data' field in the "
"json file",
pa::GENERIC_ERROR);
}
}
int count = streams.Size();
data_stream_cnt_ += count;
int offset = step_num_.size();
for (size_t i = offset; i < data_stream_cnt_; i++) {
const rapidjson::Value& steps = streams[i - offset];
const rapidjson::Value* output_steps =
(out_streams == nullptr) ? nullptr : &(*out_streams)[i - offset];
RETURN_IF_ERROR(ValidateParsingMode(steps));
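// An array entry lists the steps of one data stream (multi-stream layout);
// a plain object means the top-level 'data' array itself holds the steps of
// a single stream, handled in the else branch below.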
if (steps.IsArray()) {
step_num_.push_back(steps.Size());
for (size_t k = 0; k < step_num_[i]; k++) {
RETURN_IF_ERROR(ReadTensorData(steps[k], inputs, i, k, true));
}
if (output_steps != nullptr) {
if (!output_steps->IsArray() ||
(output_steps->Size() != steps.Size())) {
return cb::Error(
"The 'validation_data' field doesn't align with 'data' field in "
"the json file",
pa::GENERIC_ERROR);
}
for (size_t k = 0; k < step_num_[i]; k++) {
RETURN_IF_ERROR(
ReadTensorData((*output_steps)[k], outputs, i, k, false));
}
}
} else {
// There is no nesting of tensors, so interpret 'streams' itself as the list
// of steps and add the tensors to a single stream '0'.
int offset = 0;
if (step_num_.empty()) {
step_num_.push_back(count);
} else {
offset = step_num_[0];
step_num_[0] += (count);
}
data_stream_cnt_ = 1;
for (size_t k = offset; k < step_num_[0]; k++) {
RETURN_IF_ERROR(
ReadTensorData(streams[k - offset], inputs, 0, k, true));
}
if (out_streams != nullptr) {
for (size_t k = offset; k < step_num_[0]; k++) {
RETURN_IF_ERROR(
ReadTensorData((*out_streams)[k - offset], outputs, 0, k, false));
}
}
break;
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::GenerateData(
std::shared_ptr<ModelTensorMap> inputs, const bool zero_input,
const size_t string_length, const std::string& string_data)
{
// Data generation supports only a single data stream and step
// Not supported for inputs with dynamic shapes
data_stream_cnt_ = 1;
step_num_.push_back(1);
// Validate the absence of shape tensors
for (const auto& input : *inputs) {
if (input.second.is_shape_tensor_) {
return cb::Error(
"can not generate data for shape tensor '" + input.second.name_ +
"', user-provided data is needed.",
pa::GENERIC_ERROR);
}
}
uint64_t max_input_byte_size = 0;
for (const auto& input : *inputs) {
if (input.second.datatype_.compare("BYTES") != 0) {
int64_t byte_size = ByteSize(input.second.shape_, input.second.datatype_);
if (byte_size < 0) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
max_input_byte_size = std::max(max_input_byte_size, (size_t)byte_size);
} else {
// Generate string input and store it into map
std::vector<std::string> input_string_data;
int64_t batch1_num_strings = ElementCount(input.second.shape_);
if (batch1_num_strings == -1) {
return cb::Error(
"input " + input.second.name_ +
" contains dynamic shape, provide shapes to send along with "
"the request",
pa::GENERIC_ERROR);
}
input_string_data.resize(batch1_num_strings);
if (!string_data.empty()) {
for (size_t i = 0; i < batch1_num_strings; i++) {
input_string_data[i] = string_data;
}
} else {
for (size_t i = 0; i < batch1_num_strings; i++) {
input_string_data[i] = GetRandomString(string_length);
}
}
std::string key_name(
input.second.name_ + "_" + std::to_string(0) + "_" +
std::to_string(0));
auto it = input_data_.emplace(key_name, std::vector<char>()).first;
SerializeStringTensor(input_string_data, &it->second);
}
}
// Create a zero or randomly (as indicated by zero_input)
// initialized buffer that is large enough to provide the largest
// needed input. We (re)use this buffer for all non-string input values.
if (max_input_byte_size > 0) {
if (zero_input) {
input_buf_.resize(max_input_byte_size, 0);
} else {
input_buf_.resize(max_input_byte_size);
for (auto& byte : input_buf_) {
byte = rand();
}
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::GetInputData(
const ModelTensor& input, const int stream_id, const int step_id,
TensorData& data)
{
data.data_ptr = nullptr;
data.batch1_size = 0;
data.is_valid = false;
// If json data is available then try to retrieve the data from there
if (!input_data_.empty()) {
RETURN_IF_ERROR(ValidateIndexes(stream_id, step_id));
std::string key_name(
input.name_ + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
// Get the data and the corresponding byte-size
auto it = input_data_.find(key_name);
if (it != input_data_.end()) {
std::vector<char>* data_vec = &it->second;
data.is_valid = true;
data.batch1_size = data_vec->size();
data.data_ptr = (const uint8_t*)data_vec->data();
}
}
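// No user-provided data matched this input; fall back to the shared
// generated buffer (zero or random) for non-string inputs.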
if (!data.is_valid) {
if ((input.datatype_.compare("BYTES") != 0) && (input_buf_.size() != 0)) {
int64_t byte_size = ByteSize(input.shape_, input.datatype_);
if (byte_size < 0) {
return cb::Error(
"failed to get correct byte size for '" + input.name_ + "'.",
pa::GENERIC_ERROR);
}
data.batch1_size = (size_t)byte_size;
data.data_ptr = &input_buf_[0];
data.is_valid = true;
}
}
if (input.is_optional_ == false && !data.is_valid) {
return cb::Error(
"unable to find data for input '" + input.name_ + "'.",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::GetOutputData(
const std::string& output_name, const int stream_id, const int step_id,
TensorData& data)
{
data.data_ptr = nullptr;
data.batch1_size = 0;
data.is_valid = false;
// If json data is available then try to retrieve the data from there
if (!output_data_.empty()) {
RETURN_IF_ERROR(ValidateIndexes(stream_id, step_id));
std::string key_name(
output_name + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
// Get the data and the corresponding byte-size
auto it = output_data_.find(key_name);
if (it != output_data_.end()) {
std::vector<char>* data_vec = &it->second;
data.is_valid = true;
data.batch1_size = data_vec->size();
data.data_ptr = (const uint8_t*)data_vec->data();
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateIndexes(int stream_id, int step_id)
{
if (stream_id < 0 || stream_id >= (int)data_stream_cnt_) {
return cb::Error(
"stream_id for retrieving the data should be less than " +
std::to_string(data_stream_cnt_) + ", got " +
std::to_string(stream_id),
pa::GENERIC_ERROR);
}
if (step_id < 0 || step_id >= (int)step_num_[stream_id]) {
return cb::Error(
"step_id for retrieving the data should be less than " +
std::to_string(step_num_[stream_id]) + ", got " +
std::to_string(step_id),
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::GetInputShape(
const ModelTensor& input, const int stream_id, const int step_id,
std::vector<int64_t>* provided_shape)
{
std::string key_name(
input.name_ + "_" + std::to_string(stream_id) + "_" +
std::to_string(step_id));
provided_shape->clear();
// Prefer the values read from file over the ones provided from
// CLI
auto it = input_shapes_.find(key_name);
if (it != input_shapes_.end()) {
*provided_shape = it->second;
} else {
*provided_shape = input.shape_;
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadTensorData(
const rapidjson::Value& step,
const std::shared_ptr<ModelTensorMap>& tensors, const int stream_index,
const int step_index, const bool is_input)
{
auto& tensor_data = is_input ? input_data_ : output_data_;
auto& tensor_shape = is_input ? input_shapes_ : output_shapes_;
for (const auto& io : *tensors) {
if (step.HasMember(io.first.c_str())) {
std::string key_name(
io.first + "_" + std::to_string(stream_index) + "_" +
std::to_string(step_index));
auto it = tensor_data.emplace(key_name, std::vector<char>()).first;
const rapidjson::Value& tensor = step[(io.first).c_str()];
const rapidjson::Value* content;
// Check if the input data file is malformed
if (!(tensor.IsArray() || tensor.IsObject())) {
return cb::Error("Input data file is malformed.", pa::GENERIC_ERROR);
}
if (tensor.IsArray()) {
content = &tensor;
} else {
// Populate the shape values first if available
if (tensor.HasMember("shape")) {
auto shape_it =
tensor_shape.emplace(key_name, std::vector<int64_t>()).first;
for (const auto& value : tensor["shape"].GetArray()) {
if (!value.IsInt()) {
return cb::Error(
"shape values must be integers.", pa::GENERIC_ERROR);
}
shape_it->second.push_back(value.GetInt());
}
}
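// Binary tensor content may be supplied directly as a base64 string under
// the "b64" key; otherwise a "content" array is required.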
if (tensor.HasMember("b64")) {
content = &tensor;
} else {
if (!tensor.HasMember("content")) {
return cb::Error(
"missing content field. ( Location stream id: " +
std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
content = &tensor["content"];
}
}
if (content->IsArray()) {
RETURN_IF_ERROR(SerializeExplicitTensor(
*content, io.second.datatype_, &it->second));
} else {
if (content->IsObject() && content->HasMember("b64")) {
if ((*content)["b64"].IsString()) {
const std::string& encoded = (*content)["b64"].GetString();
it->second.resize(encoded.length());
base64::decoder D;
int size =
D.decode(encoded.c_str(), encoded.length(), &it->second[0]);
it->second.resize(size);
} else {
return cb::Error(
"the value of b64 field should be of type string ( "
"Location stream id: " +
std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
} else {
return cb::Error(
"The tensor values are not supported. Expected an array or "
"b64 string ( Location stream id: " +
std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
}
RETURN_IF_ERROR(ValidateTensor(io.second, stream_index, step_index));
} else if (io.second.is_optional_ == false) {
return cb::Error(
"missing tensor " + io.first +
" ( Location stream id: " + std::to_string(stream_index) +
", step id: " + std::to_string(step_index) + ")",
pa::GENERIC_ERROR);
}
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadFile(const std::string& path, std::vector<char>* contents)
{
std::ifstream in(path, std::ios::in | std::ios::binary);
if (!in) {
return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR);
}
in.seekg(0, std::ios::end);
int file_size = in.tellg();
if (file_size > 0) {
contents->resize(file_size);
in.seekg(0, std::ios::beg);
in.read(&(*contents)[0], contents->size());
}
in.close();
// If size is invalid, report after ifstream is closed
if (file_size < 0) {
return cb::Error(
"failed to get size for file '" + path + "'", pa::GENERIC_ERROR);
} else if (file_size == 0) {
return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::ReadTextFile(
const std::string& path, std::vector<std::string>* contents)
{
std::ifstream in(path);
if (!in) {
return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR);
}
std::string current_string;
while (std::getline(in, current_string)) {
contents->push_back(current_string);
}
in.close();
if (contents->size() == 0) {
return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateTensor(
const ModelTensor& model_tensor, const int stream_index,
const int step_index)
{
std::string key_name(
model_tensor.name_ + "_" + std::to_string(stream_index) + "_" +
std::to_string(step_index));
auto data_it = input_data_.find(key_name);
if (data_it == input_data_.end()) {
  // Check against the container the iterator actually came from.
  data_it = output_data_.find(key_name);
  if (data_it == output_data_.end()) {
    return cb::Error("Can't validate a nonexistent tensor");
  }
}
auto shape_it = input_shapes_.find(key_name);
const std::vector<char>& data = data_it->second;
const std::vector<int64_t>& shape = (shape_it == input_shapes_.end())
? model_tensor.shape_
: shape_it->second;
int64_t batch1_byte = ByteSize(shape, model_tensor.datatype_);
RETURN_IF_ERROR(ValidateTensorShape(shape, model_tensor));
RETURN_IF_ERROR(ValidateTensorDataSize(data, batch1_byte, model_tensor));
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateTensorShape(
const std::vector<int64_t>& shape, const ModelTensor& model_tensor)
{
int element_count = ElementCount(shape);
if (element_count < 0) {
return cb::Error(
"The variable-sized tensor \"" + model_tensor.name_ +
"\" with model shape " + ShapeVecToString(model_tensor.shape_) +
" needs to have its shape fully defined. See the --shape option.",
pa::GENERIC_ERROR);
}
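// A model dimension of -1 is variable-sized and accepts any provided value.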
bool is_error = false;
if (shape.size() != model_tensor.shape_.size()) {
is_error = true;
}
for (size_t i = 0; i < shape.size() && !is_error; i++) {
if (shape[i] != model_tensor.shape_[i] && model_tensor.shape_[i] != -1) {
is_error = true;
}
}
if (is_error) {
return cb::Error(
"The supplied shape of " + ShapeVecToString(shape) + " for input \"" +
model_tensor.name_ +
"\" is incompatible with the model's input shape of " +
ShapeVecToString(model_tensor.shape_));
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateTensorDataSize(
const std::vector<char>& data, int64_t batch1_byte,
const ModelTensor& model_tensor)
{
// Validate that the supplied data matches the amount of data expected based
// on the shape
if (batch1_byte > 0 && (size_t)batch1_byte != data.size()) {
return cb::Error(
"mismatch in the data provided for " + model_tensor.name_ +
". Expected: " + std::to_string(batch1_byte) +
" bytes, Got: " + std::to_string(data.size()) + " bytes",
pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
cb::Error
DataLoader::ValidateParsingMode(const rapidjson::Value& steps)
{
// If this is our first time parsing data, capture the mode
if (step_num_.size() == 0) {
multiple_stream_mode_ = steps.IsArray();
} else {
if (steps.IsArray() != multiple_stream_mode_) {
return cb::Error(
"Inconsistency in input-data provided. Can not have a combination of "
"objects and arrays inside of the Data array",
pa::GENERIC_ERROR);
}
}
return cb::Error::Success;
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <fstream>
#include "model_parser.h"
#include "perf_utils.h"
#include "tensor_data.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockDataLoader;
#endif
class DataLoader {
public:
DataLoader(size_t batch_size);
/// Returns the total number of data streams available.
size_t GetDataStreamsCount() { return data_stream_cnt_; }
/// Returns the total number of data steps available for the requested data
/// stream id.
/// \param stream_id The target stream id
virtual size_t GetTotalSteps(size_t stream_id)
{
if (stream_id < data_stream_cnt_) {
return step_num_[stream_id];
}
return 0;
}
/// Reads the input data from the specified data directory.
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param data_directory The path to the directory containing the data
cb::Error ReadDataFromDir(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& data_directory);
/// Reads the input data from the specified json file.
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param json_file The json file containing the user-provided input
/// data.
/// Returns error object indicating status
virtual cb::Error ReadDataFromJSON(
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs,
const std::string& json_file);
/// Generates the input data to use with the inference requests
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param zero_input Whether or not to use zero value for buffer
/// initialization.
/// \param string_length The length of the string to generate for
/// tensor inputs.
/// \param string_data The user provided string to use to populate
/// string tensors
/// Returns error object indicating status
cb::Error GenerateData(
std::shared_ptr<ModelTensorMap> inputs, const bool zero_input,
const size_t string_length, const std::string& string_data);
/// Helper function to access data for the specified input
/// \param input The target model input tensor
/// \param stream_id The data stream_id to use for retrieving input data.
/// \param step_id The data step_id to use for retrieving input data.
/// \param data Returns the input TensorData
/// Returns error object indicating status
cb::Error GetInputData(
const ModelTensor& input, const int stream_id, const int step_id,
TensorData& data);
/// Helper function to get the shape values to the input
/// \param input The target model input tensor
/// \param stream_id The data stream_id to use for retrieving input shape.
/// \param step_id The data step_id to use for retrieving input shape.
/// \param shape returns the pointer to the vector containing the shape
/// values.
/// Returns error object indicating status
cb::Error GetInputShape(
const ModelTensor& input, const int stream_id, const int step_id,
std::vector<int64_t>* shape);
/// Helper function to access data for the specified output. nullptr will be
/// returned if there is no data specified.
/// \param output_name The name of the output tensor
/// \param stream_id The data stream_id to use for retrieving output data.
/// \param step_id The data step_id to use for retrieving output data.
/// \param data Returns the output TensorData
/// Returns error object indicating status
cb::Error GetOutputData(
const std::string& output_name, const int stream_id, const int step_id,
TensorData& data);
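/// Illustrative usage sketch only. 'inputs', 'outputs', and 'input_tensor'
/// below are placeholders assumed to be supplied by the caller; they are not
/// defined in this header.
///
///   DataLoader loader(1 /* batch_size */);
///   RETURN_IF_ERROR(loader.ReadDataFromJSON(inputs, outputs, "data.json"));
///   TensorData data;
///   RETURN_IF_ERROR(loader.GetInputData(input_tensor, 0, 0, data));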
/// Return an error if the stream index or step index are invalid
cb::Error ValidateIndexes(int stream_index, int step_index);
protected:
/// Parses the input and output data from the json document
/// \param inputs The input tensors of a model
/// \param outputs The output tensors of a model
/// \param json The json document containing the raw json inputs/outputs
/// \return Returns error object indicating status
cb::Error ParseData(
const rapidjson::Document& json,
const std::shared_ptr<ModelTensorMap>& inputs,
const std::shared_ptr<ModelTensorMap>& outputs);
private:
/// Reads the data from file specified by path into vector of characters
/// \param path The complete path to the file to be read
/// \param contents The character vector that will contain the data read
/// \return error status. Returns Non-Ok if an error is encountered during
/// read operation.
virtual cb::Error ReadFile(
const std::string& path, std::vector<char>* contents);
/// Reads the string from file specified by path into vector of strings
/// \param path The complete path to the file to be read
/// \param contents The string vector that will contain the data read
/// \return error status. Returns Non-Ok if an error is encountered during
/// read operation.
virtual cb::Error ReadTextFile(
const std::string& path, std::vector<std::string>* contents);
/// Helper function to read data for the specified input from json
/// \param step the DOM for current step
/// \param inputs The pointer to the map holding the information about
/// input tensors of a model
/// \param stream_index the stream index the data should be exported to.
/// \param step_index the step index the data should be exported to.
/// Returns error object indicating status
cb::Error ReadTensorData(
const rapidjson::Value& step,
const std::shared_ptr<ModelTensorMap>& tensors, const int stream_index,
const int step_index, const bool is_input);
/// Helper function to validate the provided data and shape for the tensor
/// \param input The target model input or output tensor
/// \param stream_index the stream index the data should be exported to.
/// \param step_index the step index the data should be exported to.
/// Returns error object indicating status
cb::Error ValidateTensor(
const ModelTensor& model_tensor, const int stream_index,
const int step_index);
/// Helper function to validate the provided shape for a tensor
/// \param shape Shape for the tensor
/// \param model_tensor The tensor to validate
/// Returns error object indicating status
cb::Error ValidateTensorShape(
const std::vector<int64_t>& shape, const ModelTensor& model_tensor);
/// Helper function to validate the provided data's size
/// \param data The provided data for the tensor
/// \param batch1_byte The expected number of bytes of data
/// \param model_tensor The tensor to validate
/// Returns error object indicating status
cb::Error ValidateTensorDataSize(
const std::vector<char>& data, int64_t batch1_byte,
const ModelTensor& model_tensor);
/// Helper function to validate consistency of parsing mode for provided input
/// data. The code explicitly does not support a mixture of objects (multiple
/// entries of a single stream) and arrays (multiple streams)
///
/// \param steps The json data provided for one or multiple streams
cb::Error ValidateParsingMode(const rapidjson::Value& steps);
// The batch_size_ for the data
size_t batch_size_{1};
// The total number of data streams available.
size_t data_stream_cnt_{0};
// A vector containing the number of data steps available for each stream
// id.
std::vector<size_t> step_num_;
// User-provided input data; it is preferred over synthetic data
std::unordered_map<std::string, std::vector<char>> input_data_;
std::unordered_map<std::string, std::vector<int64_t>> input_shapes_;
// User provided output data for validation
std::unordered_map<std::string, std::vector<char>> output_data_;
std::unordered_map<std::string, std::vector<int64_t>> output_shapes_;
// Placeholder for generated input data, which will be used for all inputs
// except string
std::vector<uint8_t> input_buf_;
// Tracks what type of input data has been provided
bool multiple_stream_mode_ = false;
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockDataLoader;
public:
DataLoader() = default;
#endif
};
}} // namespace triton::perfanalyzer
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# **Perf Analyzer Documentation**
| [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) |
| -------------------------------------- | -------------------------------------------- | ---------------------------------- |
## **Installation**
See the [Installation Guide](install.md) for details on how to install Perf
Analyzer.
## **Getting Started**
The [Quick Start Guide](quick_start.md) will show you how to use Perf
Analyzer to profile a simple PyTorch model.
## **User Guide**
The User Guide describes the Perf Analyzer command line options, how to specify
model input data, the performance measurement modes, the performance metrics and
outputs, how to benchmark different servers, and more.
- [Perf Analyzer CLI](cli.md)
- [Inference Load Modes](inference_load_modes.md)
- [Input Data](input_data.md)
- [Measurements & Metrics](measurements_metrics.md)
- [Benchmarking](benchmarking.md)
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Benchmarking Triton via HTTP or gRPC endpoint
This is the default mode for Perf Analyzer.
# Benchmarking Triton directly via C API
Besides using HTTP or gRPC server endpoints to communicate with Triton, Perf
Analyzer also allows users to benchmark Triton directly using the C API. HTTP
and gRPC endpoints introduce additional latency in the pipeline, which may not
be of interest to users who are using Triton via the C API within their
application. Specifically, this feature is useful for benchmarking a bare
minimum Triton without the additional overhead of HTTP/gRPC communication.
## Prerequisite
Pull the Triton SDK and the Triton Server container images on the target
machine.
Since you will need access to the `tritonserver` install, it might be easier if
you copy the `perf_analyzer` binary to the Inference Server container.
## Required parameters
Use the [`--help`](cli.md#--help) option to see a complete list of supported
command line arguments. By default, Perf Analyzer expects the Triton instance to
already be running. You can configure C API mode using the
[`--service-kind`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve)
option. In addition, you will need to point Perf Analyzer to the Triton server
library path using the
[`--triton-server-directory`](cli.md#--triton-server-directorypath) option and
the model repository path using the
[`--model-repository`](cli.md#--model-repositorypath) option.
An example run would look like:
```
$ perf_analyzer -m my_model --service-kind=triton_c_api --triton-server-directory=/opt/tritonserver --model-repository=/my/model/repository
...
*** Measurement Settings ***
Service Kind: Triton C-API
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 353
Throughput: 19.6095 infer/sec
Avg latency: 50951 usec (standard deviation 2265 usec)
p50 latency: 50833 usec
p90 latency: 50923 usec
p95 latency: 50940 usec
p99 latency: 50985 usec
Server:
Inference count: 353
Execution count: 353
Successful request count: 353
Avg request latency: 50841 usec (overhead 20 usec + queue 63 usec + compute input 35 usec + compute infer 50663 usec + compute output 59 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 19.6095 infer/sec, latency 50951 usec
```
## Non-supported functionalities
There are a few functionalities that are missing from C API mode. They are:
1. Async mode ([`--async`](cli.md#--async))
2. For additional known non-working cases, please refer to
[qa/L0_perf_analyzer_capi/test.sh](https://github.com/triton-inference-server/server/blob/main/qa/L0_perf_analyzer_capi/test.sh#L239-L277)
# Benchmarking TensorFlow Serving
Perf Analyzer can also be used to benchmark models deployed on
[TensorFlow Serving](https://github.com/tensorflow/serving) using the
[`--service-kind=tfserving`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve)
option. Only the gRPC protocol is supported.
The following invocation demonstrates how to configure Perf Analyzer to issue
requests to a running instance of `tensorflow_model_server`:
```
$ perf_analyzer -m resnet50 --service-kind tfserving -i grpc -b 1 -p 5000 -u localhost:8500
*** Measurement Settings ***
Batch size: 1
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 829
Throughput: 165.8 infer/sec
Avg latency: 6032 usec (standard deviation 569 usec)
p50 latency: 5863 usec
p90 latency: 6655 usec
p95 latency: 6974 usec
p99 latency: 8093 usec
Avg gRPC time: 5984 usec ((un)marshal request/response 257 usec + response wait 5727 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 165.8 infer/sec, latency 6032 usec
```
You might have to specify a different URL ([`-u`](cli.md#-u-url)) to access
wherever the server is running. The Perf Analyzer report will only include
statistics measured on the client side.
**NOTE:** The support is still in **beta**. Perf Analyzer does not guarantee
optimal tuning for TensorFlow Serving. However, a single benchmarking tool that
can be used to stress the inference servers in an identical manner is important
for performance analysis.
The following points are important for interpreting the results:
1. `Concurrent Request Execution`:
TensorFlow Serving (TFS), as of version 2.8.0, by default creates a thread for
each request, and each thread individually submits its request to the
TensorFlow session. There
is a resource limit on the number of concurrent threads serving requests.
When benchmarking at a higher request concurrency, you can see higher
throughput because of this. Unlike TFS, by default Triton is configured with
only a single
[instance count](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#instance-groups).
Hence, at a higher request concurrency, most of the requests are blocked on
the instance availability. To configure Triton to behave like TFS, set the
instance count to a reasonably high value and then set
[MAX_SESSION_SHARE_COUNT](https://github.com/triton-inference-server/tensorflow_backend#parameters)
parameter in the model `config.pbtxt` to the same value. For some context,
TFS sets its thread constraint to four times the number of schedulable CPUs.
2. `Different library versions`:
The version of TensorFlow might differ between Triton and TensorFlow Serving
being benchmarked. Even the versions of CUDA libraries might differ between
the two solutions. The performance of models can be susceptible to the
versions of these libraries. For a single request concurrency, if the
`compute_infer` time reported by Perf Analyzer when benchmarking Triton is as
large as the latency reported by Perf Analyzer when benchmarking TFS, then
the performance difference is likely because of the difference in the
software stack and outside the scope of Triton.
3. `CPU Optimization`:
TFS has separate builds for CPU and GPU targets. They have target-specific
optimization. Unlike TFS, Triton has a single build which is optimized for
execution on GPUs. When collecting performance on CPU models on Triton, try
running Triton with the environment variable `TF_ENABLE_ONEDNN_OPTS=1`.
# Benchmarking TorchServe
Perf Analyzer can also be used to benchmark
[TorchServe](https://github.com/pytorch/serve) using the
[`--service-kind=torchserve`](cli.md#--service-kindtritontriton_c_apitfservingtorchserve)
option. Only the HTTP protocol is supported. It also requires the input to be
provided via a JSON file.
The following invocation demonstrates how to configure Perf Analyzer to issue
requests to a running instance of `torchserve` assuming the location holds
`kitten_small.jpg`:
```
$ perf_analyzer -m resnet50 --service-kind torchserve -i http -u localhost:8080 -b 1 -p 5000 --input-data data.json
Successfully read data for 1 stream/streams with 1 step/steps.
*** Measurement Settings ***
Batch size: 1
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 799
Throughput: 159.8 infer/sec
Avg latency: 6259 usec (standard deviation 397 usec)
p50 latency: 6305 usec
p90 latency: 6448 usec
p95 latency: 6494 usec
p99 latency: 7158 usec
Avg HTTP time: 6272 usec (send/recv 77 usec + response wait 6195 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 159.8 infer/sec, latency 6259 usec
```
The content of `data.json`:
```json
{
"data" :
[
{
"TORCHSERVE_INPUT" : ["kitten_small.jpg"]
}
]
}
```
You might have to specify a different URL ([`-u`](cli.md#-u-url)) to access
wherever the server is running. The Perf Analyzer report will only include
statistics measured on the client side.
**NOTE:** The support is still in **beta**. Perf Analyzer does not guarantee
optimal tuning for TorchServe. However, a single benchmarking tool that can be
used to stress the inference servers in an identical manner is important for
performance analysis.
# Advantages of using Perf Analyzer over third-party benchmark suites
Triton Inference Server offers the entire serving solution which includes
[client libraries](https://github.com/triton-inference-server/client) that are
optimized for Triton. Using third-party benchmark suites like `jmeter` fails to
take advantage of the optimized libraries. Some of these optimizations include,
but are not limited to:
1. Using
[binary tensor data extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#binary-tensor-data-extension)
with HTTP requests.
2. Effective re-use of gRPC message allocation in subsequent requests.
3. Avoiding extra memory copy via libcurl interface.
These optimizations can have a tremendous impact on overall performance. Using
Perf Analyzer for benchmarking directly allows a user to access these
optimizations in their study.
In addition, Perf Analyzer is highly customizable and supports many Triton
features as described in this document. This, along with a detailed report,
allows a user to identify performance bottlenecks and experiment with different
features before deciding upon what works best for them.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Perf Analyzer CLI
This document details the Perf Analyzer command line interface:
- [General Options](#general-options)
- [Measurement Options](#measurement-options)
- [Sequence Model Options](#sequence-model-options)
- [Input Data Options](#input-data-options)
- [Request Options](#request-options)
- [Server Options](#server-options)
- [Prometheus Metrics Options](#prometheus-metrics-options)
- [Report Options](#report-options)
- [Trace Options](#trace-options)
- [Deprecated Options](#deprecated-options)
## General Options
#### `-?`
#### `-h`
#### `--help`
Prints a description of the Perf Analyzer command line interface.
#### `-m <string>`
Specifies the model name for Perf Analyzer to run.
This is a required option.
#### `-x <string>`
Specifies the version of the model to be used. If not specified, the most
recent version (the highest numbered version) of the model will be used.
#### `--service-kind=[triton|triton_c_api|tfserving|torchserve]`
Specifies the kind of service for Perf Analyzer to generate load for. Note: in
order to use the `torchserve` backend, the `--input-data` option must point to
a
JSON file holding data in the following format:
```
{
"data": [
{
"TORCHSERVE_INPUT": [
"<complete path to the content file>"
]
},
{...},
...
]
}
```
The type of file here will depend on the model. In order to use `triton_c_api`
you must specify the Triton server install path and the model repository path
via the `--triton-server-directory` and `--model-repository` options.
Default is `triton`.
#### `--bls-composing-models=<string>`
Specifies the list of all BLS composing models as a comma separated list of
model names (with optional model version number after a colon for each) that may
be called by the input BLS model. For example,
`--bls-composing-models=modelA:3,modelB` would specify that modelA and modelB
are composing models that may be called by the input BLS model, and that modelA
will use version 3, while modelB's version is unspecified.
#### `--model-signature-name=<string>`
Specifies the signature name of the saved model to use.
Default is `serving_default`. This option will be ignored if `--service-kind`
is not `tfserving`.
#### `-v`
Enables verbose mode. May be specified an additional time (`-v -v`) to enable
extra verbose mode.
## Measurement Options
#### `--measurement-mode=[time_windows|count_windows]`
Specifies the mode used for stabilizing measurements. 'time_windows' will
create windows such that the duration of each window is equal to
`--measurement-interval`. 'count_windows' will create windows such that there
are at least `--measurement-request-count` requests in each window and that
the window is at least one second in duration (adding more requests if
necessary).
Default is `time_windows`.
#### `-p <n>`
#### `--measurement-interval=<n>`
Specifies the time interval used for each measurement in milliseconds when
`--measurement-mode=time_windows` is used. Perf Analyzer will sample a time
interval specified by this option and take measurement over the requests
completed within that time interval.
Default is `5000`.
#### `--measurement-request-count=<n>`
Specifies the minimum number of requests to be collected in each measurement
window when `--measurement-mode=count_windows` is used.
Default is `50`.
#### `-s <n>`
#### `--stability-percentage=<n>`
Specifies the allowed variation in latency measurements when determining if a
result is stable. The measurement is considered stable if the ratio of max /
min from the recent 3 measurements is within (stability percentage)% in terms
of both inferences per second and latency.
Default is `10`(%).
#### `--percentile=<n>`
Specifies the confidence value as a percentile that will be used to determine
if a measurement is stable. For example, a value of `85` indicates that the
85th percentile latency will be used to determine stability. The percentile
will also be reported in the results.
Default is `-1` indicating that the average latency is used to determine
stability.
#### `-r <n>`
#### `--max-trials=<n>`
Specifies the maximum number of measurements when attempting to reach stability
of inferences per second and latency for each concurrency or request rate
during the search. Perf Analyzer will terminate if the measurement is still
unstable after the maximum number of trials.
Default is `10`.
#### `--concurrency-range=<start:end:step>`
Specifies the range of concurrency levels covered by Perf Analyzer. Perf
Analyzer will start from the concurrency level of 'start' and go until 'end'
with a stride of 'step'.
The defaults for 'end' and 'step' are `1`. If 'end' is not specified, then Perf
Analyzer will run for a single concurrency level determined by 'start'. If
'end' is set as `0`, then the concurrency limit will be incremented by 'step'
until the latency threshold is met. 'end' and `--latency-threshold` cannot
both be `0`. 'end' cannot be `0` for sequence models while using asynchronous
mode.
#### `--request-rate-range=<start:end:step>`
Specifies the range of request rates for load generated by Perf Analyzer. This
option can take floating-point values. The search along the request rate range
is enabled only when using this option.
If not specified, then Perf Analyzer will search along the concurrency range.
Perf Analyzer will start from the request rate of 'start' and go until 'end'
with a stride of 'step'. Default values of 'start', 'end' and 'step' are all
`1.0`. If 'end' is not specified, then Perf Analyzer will run for a single
request rate as determined by 'start'. If 'end' is set as `0.0`, then the
request rate will be incremented by 'step' until the latency threshold is met.
'end' and `--latency-threshold` cannot both be `0`.
#### `--request-distribution=[constant|poisson]`
Specifies the time interval distribution between dispatching inference requests
to the server. A Poisson distribution closely mimics the real-world workload on
a server. This option is ignored if not using `--request-rate-range`.
Default is `constant`.
#### `-l <n>`
#### `--latency-threshold=<n>`
Specifies the limit on the observed latency, in milliseconds. Perf Analyzer
will terminate the concurrency or request rate search once the measured latency
exceeds this threshold.
Default is `0` indicating that Perf Analyzer will run for the entire
concurrency or request rate range.
#### `--binary-search`
Enables binary search on the specified search range (concurrency or request
rate). This option requires 'start' and 'end' to be explicitly specified in
the concurrency range or request rate range. When using this option, 'step'
acts more like a precision: the lower the 'step', the more iterations are taken
along the search path to find a suitable convergence point.
When `--binary-search` is not specified, linear search is used.
#### `--request-intervals=<path>`
Specifies a path to a file containing time intervals in microseconds, one
interval per line. Perf Analyzer will attempt to keep the time intervals
between successive generated requests as close as possible to the intervals in
this file. This option can be used to apply a custom load pattern of interest
to the server. Perf Analyzer will loop around the file if the duration of
execution exceeds the amount of time specified by the intervals.
This option cannot be used with `--request-rate-range` or
`--concurrency-range`.
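For example, a hypothetical workflow (the file name `intervals.txt` and model
name `my_model` are illustrative) that replays a custom request pattern:
```bash
# Three intervals in microseconds (100 ms, 200 ms, 500 ms), looped as needed.
printf "100000\n200000\n500000\n" > intervals.txt
perf_analyzer -m my_model --request-intervals=intervals.txt
```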
#### `--max-threads=<n>`
Specifies the maximum number of threads that will be created to provide the
desired concurrency or request rate. However, when running in synchronous mode
with `--concurrency-range` having an explicit 'end' specification, this value
will be ignored.
Default is `4` if `--request-rate-range` is specified, otherwise default is
`16`.
## Sequence Model Options
#### `--num-of-sequences=<n>`
Specifies the number of concurrent sequences for sequence models. This option
is ignored when `--request-rate-range` is not specified.
Default is `4`.
#### `--sequence-length=<n>`
Specifies the base length of a sequence used for sequence models. A sequence
with length X will be composed of X requests to be sent as the elements in the
sequence. The actual length of the sequence will be within +/- Y% of the base
length, where Y defaults to 20% and is customizable via
`--sequence-length-variation`. If sequence length is unspecified and input data
is provided, the sequence length will be the number of inputs in the
user-provided input data.
Default is `20`.
#### `--sequence-length-variation=<n>`
Specifies the percentage variation in length of sequences. This option is only
valid when not using user-provided input data or when `--sequence-length` is
specified while using user-provided input data.
Default is `20`(%).
#### `--sequence-id-range=<start:end>`
Specifies the range of sequence IDs used by Perf Analyzer. Perf Analyzer will
start from the sequence ID of 'start' and go until 'end' (excluded). If 'end'
is not specified then Perf Analyzer will generate new sequence IDs without
bounds. If 'end' is specified and the concurrency setting may result in
maintaining a number of sequences more than the range of available sequence
IDs, Perf Analyzer will exit with an error due to possible sequence ID
collisions.
The default for 'start' is `1`, and 'end' is not specified (no bounds).
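For example, a hypothetical run (model name is illustrative) that restricts
sequence IDs to the range [100, 200):
```bash
# Sequence IDs are drawn from 100 (inclusive) to 200 (exclusive).
perf_analyzer -m my_sequence_model --sequence-id-range=100:200
```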
#### `--serial-sequences`
Enables the serial sequence mode, where at most one request is live per sequence
at any time.
Note: in request rate mode, this may prevent Perf Analyzer from achieving the
desired rate, especially if `--num-of-sequences` is too small.
## Input Data Options
#### `--input-data=[zero|random|<path>]`
Specifies the type of data that will be used for input in inference requests.
The available options are `zero`, `random`, and a path to a directory or a JSON
file.
When pointing to a JSON file, the user must adhere to the format described in
the [input data documentation](input_data.md). By specifying JSON data, users
can control data used with every request. Multiple data streams can be specified
for a sequence model, and Perf Analyzer will select a data stream in a
round-robin fashion for every new sequence. Multiple JSON files can also be
provided (`--input-data json_file1.json --input-data json_file2.json` and so on)
and Perf Analyzer will append data streams from each file. When using
`--service-kind=torchserve`, make sure this option points to a JSON file.
If the option is a path to a directory, then the directory must contain a
binary file for each non-string input and a text file for each string input,
each named the same as the input. Each file must contain the data required for
that input for a batch-1 request. Each binary file should contain the raw
binary representation of the input in row-major order. Each text file should
contain all strings needed by batch-1, one per line, listed in row-major order.
Default is `random`.
#### `-b <n>`
Specifies the batch size for each request sent.
Default is `1`.
#### `--shape=<string>`
Specifies the shape used for the specified input. The argument must be
specified as 'name:shape' where the shape is a comma-separated list for
dimension sizes. For example `--shape=input_name:1,2,3` indicates that the
input `input_name` has tensor shape [ 1, 2, 3 ]. `--shape` may be specified
multiple times to specify shapes for different inputs.
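For example, a sketch for a hypothetical model with two variable-sized inputs
named `IMAGE` and `MASK`:
```bash
# Provide a concrete shape for each variable-sized input.
perf_analyzer -m my_model --shape=IMAGE:3,224,224 --shape=MASK:224,224
```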
#### `--string-data=<string>`
Specifies the string to initialize string input buffers. Perf Analyzer will
replicate the given string to build tensors of the required shape. When this
option is used, `--string-length` has no effect. This option is ignored if
`--input-data` points to a JSON file or directory.
#### `--string-length=<n>`
Specifies the length of the random strings to be generated by Perf Analyzer
for string input. This option is ignored if `--input-data` points to a
JSON file or directory.
Default is `128`.
#### `--shared-memory=[none|system|cuda]`
Specifies the type of the shared memory to use for input and output data.
Default is `none`.
#### `--output-shared-memory-size=<n>`
Specifies the size, in bytes, of the shared memory region to allocate per
output tensor. Only needed when one or more of the outputs are of string type
and/or have variable shape. The value should be larger than the size of the
largest output tensor that the model is expected to return. Perf Analyzer will
use the following formula to calculate the total shared memory to allocate:
output_shared_memory_size * number_of_outputs * batch_size.
Default is `102400` (100 KB).
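As a worked example of the formula above (all values hypothetical): with
`--output-shared-memory-size=204800`, a model returning 2 outputs, and a batch
size of 4, Perf Analyzer would allocate 204800 * 2 * 4 = 1,638,400 bytes:
```bash
# Hypothetical sizing: 204800 bytes per output x 2 outputs x batch size 4.
perf_analyzer -m my_model -b 4 --shared-memory=system --output-shared-memory-size=204800
```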
#### `--input-tensor-format=[binary|json]`
Specifies the Triton inference request input tensor format. Only valid when HTTP
protocol is used.
Default is `binary`.
#### `--output-tensor-format=[binary|json]`
Specifies the Triton inference response output tensor format. Only valid when
HTTP protocol is used.
Default is `binary`.
## Request Options
#### `-i [http|grpc]`
Specifies the communication protocol to use. The available protocols are HTTP
and gRPC.
Default is `http`.
#### `-a`
#### `--async`
Enables asynchronous mode in Perf Analyzer.
By default, Perf Analyzer will use a synchronous request API for inference.
However, if the model is sequential, then the default mode is asynchronous.
Specify `--sync` to operate sequential models in synchronous mode. In
synchronous mode, Perf Analyzer will start threads equal to the concurrency
level. Use asynchronous mode to limit the number of threads, yet maintain the
concurrency.
#### `--sync`
Enables synchronous mode in Perf Analyzer. Can be used to operate Perf
Analyzer with sequential models in synchronous mode.
#### `--streaming`
Enables the use of streaming API. This option is only valid with gRPC protocol.
#### `-H <string>`
Specifies the header that will be added to HTTP requests (ignored for gRPC
requests). The header must be specified as 'Header:Value'. `-H` may be
specified multiple times to add multiple headers.
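For example, a sketch that adds two hypothetical headers to every HTTP request
(header names and values are illustrative):
```bash
# -H may be repeated; headers are ignored when using gRPC.
perf_analyzer -m my_model -i http -H "Authorization: Bearer my_token" -H "X-Trace-Id: 123"
```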
#### `--grpc-compression-algorithm=[none|gzip|deflate]`
Specifies the compression algorithm to be used by gRPC when sending requests.
Only supported when gRPC protocol is being used.
Default is `none`.
## Server Options
#### `-u <url>`
Specifies the URL for the server.
Default is `localhost:8000` when using `--service-kind=triton` with HTTP.
Default is `localhost:8001` when using `--service-kind=triton` with gRPC.
Default is `localhost:8500` when using `--service-kind=tfserving`.
#### `--ssl-grpc-use-ssl`
Enables usage of an encrypted channel to the server.
#### `--ssl-grpc-root-certifications-file=<path>`
Specifies the path to the file containing the PEM encoding of the server root
certificates.
#### `--ssl-grpc-private-key-file=<path>`
Specifies the path to the file containing the PEM encoding of the client's
private key.
#### `--ssl-grpc-certificate-chain-file=<path>`
Specifies the path to the file containing the PEM encoding of the client's
certificate chain.
#### `--ssl-https-verify-peer=[0|1]`
Specifies whether to verify the peer's SSL certificate. See
https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html for the meaning of each
value.
Default is `1`.
#### `--ssl-https-verify-host=[0|1|2]`
Specifies whether to verify the certificate's name against host. See
https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html for the meaning of each
value.
Default is `2`.
#### `--ssl-https-ca-certificates-file=<path>`
Specifies the path to Certificate Authority (CA) bundle.
#### `--ssl-https-client-certificate-file=<path>`
Specifies the path to the SSL client certificate.
#### `--ssl-https-client-certificate-type=[PEM|DER]`
Specifies the type of the client SSL certificate.
Default is `PEM`.
#### `--ssl-https-private-key-file=<path>`
Specifies the path to the private key file for TLS and SSL client cert.
#### `--ssl-https-private-key-type=[PEM|DER]`
Specifies the type of the private key file.
Default is `PEM`.
#### `--triton-server-directory=<path>`
Specifies the Triton server install path. Required by and only used when C API
is used (`--service-kind=triton_c_api`).
Default is `/opt/tritonserver`.
#### `--model-repository=<path>`
Specifies the model repository directory path for loading models. Required by
and only used when C API is used (`--service-kind=triton_c_api`).
## Prometheus Metrics Options
#### `--collect-metrics`
Enables the collection of server-side inference server metrics. Perf Analyzer
will output metrics in the CSV file generated with the `-f` option. Only valid
when the `--verbose-csv` option is also used.
#### `--metrics-url=<url>`
Specifies the URL to query for server-side inference server metrics.
Default is `localhost:8002/metrics`.
#### `--metrics-interval=<n>`
Specifies how often, in milliseconds, Perf Analyzer should query for
server-side inference server metrics within each measurement window.
Default is `1000`.
## Report Options
#### `-f <path>`
Specifies the path that the latency report file will be generated at.
When `-f` is not specified, a latency report will not be generated.
#### `--profile-export-file <path>`
Specifies the path that the profile export will be generated at.
When `--profile-export-file` is not specified, a profile export will not be
generated.
#### `--verbose-csv`
Enables additional information being output to the CSV file generated by Perf
Analyzer.
## Trace Options
#### `--trace-file=<path>`
Specifies the file where trace output will be saved.
If `--log-frequency` is also specified, this argument value will be the
prefix of the files to save the trace output. See `--log-frequency` for
details. Only used for `--service-kind=triton`.
#### `--trace-level=[OFF|TIMESTAMPS|TENSORS]`
Specifies a trace level. `OFF` disables tracing. `TIMESTAMPS` traces
timestamps. `TENSORS` traces tensors. It may be specified multiple times to
trace multiple types of information.
Default is `OFF`.
#### `--trace-rate=<n>`
Specifies the trace sampling rate (traces per second).
Default is `1000`.
#### `--trace-count=<n>`
Specifies the number of traces to be sampled. If the value is `-1`, the number
of traces to be sampled will not be limited.
Default is `-1`.
#### `--log-frequency=<n>`
Specifies the trace log frequency. If the value is `0`, Triton will only log
the trace output to path specified via `--trace-file` when shutting down.
Otherwise, Triton will log the trace output to the path specified via
`--trace-file`.<idx> when it collects the specified number of traces. For
example, if `--trace-file` is specified to be `trace_file.log`, and if the log
frequency is `100`, when Triton collects the 100th trace, it logs the traces
to file `trace_file.log.0`, and when it collects the 200th trace, it logs the
101st to the 200th traces to file `trace_file.log.1`.
Default is `0`.
## Deprecated Options
#### `--data-directory=<path>`
**DEPRECATED**
Alias for `--input-data=<path>` where `<path>` is the path to a directory. See
`--input-data` option documentation for details.
#### `-c <n>`
**DEPRECATED**
Specifies the maximum concurrency that Perf Analyzer will search up to. Cannot
be used with `--concurrency-range`.
#### `-d`
**DEPRECATED**
Enables dynamic concurrency mode. Perf Analyzer will search along
concurrencies up to the maximum concurrency specified via `-c <n>`. Cannot be
used with `--concurrency-range`.
#### `-t <n>`
**DEPRECATED**
Specifies the number of concurrent requests. Cannot be used with
`--concurrency-range`.
Default is `1`.
#### `-z`
**DEPRECATED**
Alias for `--input-data=zero`. See `--input-data` option documentation for
details.
{
"experiments": [
{
"experiment": {
"mode": "concurrency",
"value": 4
},
"requests": [
{
"timestamp": 1,
"sequence_id": 1,
"response_timestamps": [
2,
3,
4
]
},
{
"timestamp": 5,
"sequence_id": 2,
"response_timestamps": []
},
{
"timestamp": 6,
"sequence_id": 2,
"response_timestamps": [
7,
8,
9
]
}
],
"window_boundaries": [
1,
5,
6
]
}
],
"version": "1.2.3"
}
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Inference Load Modes
Perf Analyzer has several modes for generating inference request load for a
model.
## Concurrency Mode
In concurrency mode, Perf Analyzer attempts to send inference requests to the
server such that N requests are always outstanding during profiling. For
example, when using
[`--concurrency-range=4`](cli.md#--concurrency-rangestartendstep), Perf Analyzer
will attempt to have 4 outgoing inference requests at all times during
profiling.
## Request Rate Mode
In request rate mode, Perf Analyzer attempts to send N inference requests per
second to the server during profiling. For example, when using
[`--request-rate-range=20`](cli.md#--request-rate-rangestartendstep), Perf
Analyzer will attempt to send 20 requests per second during profiling.
## Custom Interval Mode
In custom interval mode, Perf Analyzer attempts to send inference requests
according to intervals (between requests, looping if necessary) provided by the
user in the form of a text file with one time interval (in microseconds) per
line. For example, when using
[`--request-intervals=my_intervals.txt`](cli.md#--request-intervalspath),
where `my_intervals.txt` contains:
```
100000
200000
500000
```
Perf Analyzer will attempt to send requests at the following times: 0.1s, 0.3s,
0.8s, 0.9s, 1.1s, 1.6s, and so on, during profiling.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Input Data
Use the [`--help`](cli.md#--help) option to see complete documentation for all
input data options. By default Perf Analyzer sends random data to all the inputs
of your model. You can select a different input data mode with the
[`--input-data`](cli.md#--input-datazerorandompath) option:
- _random_: (default) Send random data for each input. Note: Perf Analyzer only
generates random data once per input and reuses that for all inferences
- _zero_: Send zeros for each input.
- directory path: A path to a directory containing a binary file for each input,
named the same as the input. Each binary file must contain the data required
for that input for a batch-1 request. Each file should contain the raw binary
representation of the input in row-major order.
- file path: A path to a JSON file containing data to be used with every
inference request. See the "Real Input Data" section for further details.
[`--input-data`](cli.md#--input-datazerorandompath) can be provided multiple
times with different file paths to specify multiple JSON files.
For tensors with `STRING`/`BYTES` datatype, the
[`--string-length`](cli.md#--string-lengthn) and
[`--string-data`](cli.md#--string-datastring) options may be used in some cases
(see [`--help`](cli.md#--help) for full documentation).
For models that support batching you can use the [`-b`](cli.md#-b-n) option to
indicate the batch size of the requests that Perf Analyzer should send. For
models with variable-sized inputs you must provide the
[`--shape`](cli.md#--shapestring) argument so that Perf Analyzer knows what
shape tensors to use. For example, for a model that has an input called
`IMAGE` that has shape `[3, N, M]`, where `N` and `M` are variable-size
dimensions, to tell Perf Analyzer to send batch size 4 requests of shape
`[3, 224, 224]`:
```
$ perf_analyzer -m mymodel -b 4 --shape IMAGE:3,224,224
```
## Real Input Data
The performance of some models is highly dependent on the data used. For such
cases you can provide data to be used with every inference request made by Perf
Analyzer in a JSON file. Perf Analyzer will use the provided data in a
round-robin order when sending inference requests. For sequence models, if a
sequence length is specified via
[`--sequence-length`](cli.md#--sequence-lengthn), Perf Analyzer will also loop
through the provided data in a round-robin order up to the specified sequence
length (with a percentage variation customizable via
[`--sequence-length-variation`](cli.md#--sequence-length-variationn)).
Otherwise, the sequence length will be the number of inputs specified in
user-provided input data.
Each entry in the `"data"` array must specify all input tensors with the exact
size expected by the model for a single batch. The following example describes
data for a model with inputs named `INPUT0` and `INPUT1`, each with shape
`[4, 4]` and data type `INT32`:
```json
{
"data":
[
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
},
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
},
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
},
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
]
}
```
Note that the `[4, 4]` tensor has been flattened in a row-major format for the
inputs. In addition to specifying explicit tensors, you can also provide Base64
encoded binary data for the tensors. Each data object must list its data in a
row-major order. Binary data must be in little-endian byte order. The following
example highlights how this can be achieved:
```json
{
"data":
[
{
"INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="},
"INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="}
},
{
"INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="},
"INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="}
},
{
"INPUT0": {"b64": "YmFzZTY0IGRlY29kZXI="},
"INPUT1": {"b64": "YmFzZTY0IGRlY29kZXI="}
}
]
}
```
In the case of sequence models, multiple data streams can be specified in the JSON
file. Each sequence will get a data stream of its own and Perf Analyzer will
ensure the data from each stream is played back to the same correlation ID. The
below example highlights how to specify data for multiple streams for a sequence
model with a single input named `INPUT`, shape `[1]` and data type `STRING`:
```json
{
"data":
[
[
{
"INPUT": ["1"]
},
{
"INPUT": ["2"]
},
{
"INPUT": ["3"]
},
{
"INPUT": ["4"]
}
],
[
{
"INPUT": ["1"]
},
{
"INPUT": ["1"]
},
{
"INPUT": ["1"]
}
],
[
{
"INPUT": ["1"]
},
{
"INPUT": ["1"]
}
]
]
}
```
The above example describes three data streams with lengths 4, 3 and 2
respectively. Perf Analyzer will hence produce sequences of length 4, 3 and 2 in
this case.
You can also provide an optional `"shape"` field for the tensors. This is
especially useful when profiling models with variable-sized input tensors.
Additionally, note that when providing the `"shape"` field, tensor contents
must be provided separately in a `"content"` field in row-major order. The
specified shape values will override default input shapes provided as a command
line option (see [`--shape`](cli.md#--shapestring)) for variable-sized inputs.
In the absence of a `"shape"` field, the provided defaults will be used. There
is no need to specify shape as a command line option if all the input data
entries provide shape values for variable-sized tensors. Below is an example
JSON file for a model with a single input `INPUT`, shape `[-1, -1]` and data
type `INT32`:
```json
{
"data":
[
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"shape": [2,8]
}
},
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"shape": [8,2]
}
},
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
},
{
"INPUT":
{
"content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"shape": [4,4]
}
}
]
}
```
The following example provides contents as base64-encoded strings with explicit
shapes:
```json
{
"data":
[
{
"INPUT":
{
"content": {"b64": "/9j/4AAQSkZ(...)"},
"shape": [7964]
}
},
{
"INPUT":
{
"content": {"b64": "/9j/4AAQSkZ(...)"},
"shape": [7964]
}
}
]
}
```
Note that for `STRING` type, an element is represented by a 4-byte unsigned
integer giving the length followed by the actual bytes. The byte array to be
encoded using base64 must include the 4-byte unsigned integers.
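As a minimal sketch of this encoding (assuming the same little-endian byte
order used for other binary data), the 5-byte string `hello` is prefixed with
its length as a 4-byte unsigned integer and then base64-encoded:
```bash
# 4-byte little-endian length (5) followed by the raw bytes of "hello".
printf '\x05\x00\x00\x00hello' | base64
# Expected output: BQAAAGhlbGxv
```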
### Output Validation
When real input data is provided, you can optionally request that Perf Analyzer
validate the inference output for the input data.
Validation output can be specified in the `"validation_data"` field and must
have the same format as the `"data"` field for real input. Note that the
entries in `"validation_data"` must align with `"data"` for proper mapping. The
following example describes validation data for a model with inputs named
`INPUT0` and `INPUT1`, outputs named `OUTPUT0` and `OUTPUT1`, where all tensors
have shape `[4, 4]` and data type `INT32`:
```json
{
"data":
[
{
"INPUT0": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
"INPUT1": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
],
"validation_data":
[
{
"OUTPUT0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
"OUTPUT1": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
}
]
}
```
Besides the above example, the validation outputs can be specified using the
same variations described in the real input data section.
# Shared Memory
By default Perf Analyzer sends input tensor data and receives output tensor data
over the network. You can instead instruct Perf Analyzer to use system shared
memory or CUDA shared memory to communicate tensor data. By using these options
you can model the performance that you can achieve by using shared memory in
your application. Use
[`--shared-memory=system`](cli.md#--shared-memorynonesystemcuda) to use system
(CPU) shared memory or
[`--shared-memory=cuda`](cli.md#--shared-memorynonesystemcuda) to use CUDA
shared memory.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Recommended Installation Method
## Triton SDK Container
The recommended way to "install" Perf Analyzer is to run the pre-built
executable from within the Triton SDK docker container available on the
[NVIDIA GPU Cloud Catalog](https://ngc.nvidia.com/catalog/containers/nvidia:tritonserver).
As long as the SDK container has its network exposed to the address and port of
the inference server, Perf Analyzer will be able to run.
```bash
export RELEASE=<yy.mm> # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02`
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
# inside container
perf_analyzer -m <model>
```
# Alternative Installation Methods
- [Pip](#pip)
- [Build from Source](#build-from-source)
## Pip
```bash
pip install tritonclient
perf_analyzer -m <model>
```
**Warning**: If any runtime dependencies are missing, Perf Analyzer will produce
errors showing which ones are missing. You will need to manually install them.
## Build from Source
The Triton SDK container is used for building, so some build and runtime
dependencies are already installed.
```bash
export RELEASE=<yy.mm> # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02`
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
# inside container
# prep installing newer version of cmake
apt update && apt install -y gpg wget && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && . /etc/os-release && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null
# install build/runtime dependencies
apt update && apt install -y cmake-data cmake libcurl4-openssl-dev rapidjson-dev
rm -rf client ; git clone --depth 1 https://github.com/triton-inference-server/client
mkdir client/build ; cd client/build
cmake -DTRITON_ENABLE_PERF_ANALYZER=ON ..
make -j8 cc-clients
perf_analyzer -m <model>
```
- To enable
[CUDA shared memory](input_data.md#shared-memory), add
`-DTRITON_ENABLE_GPU=ON` to the `cmake` command.
- To enable
[C API mode](benchmarking.md#benchmarking-triton-directly-via-c-api), add
`-DTRITON_ENABLE_PERF_ANALYZER_C_API=ON` to the `cmake` command.
- To enable [TorchServe backend](benchmarking.md#benchmarking-torchserve), add
`-DTRITON_ENABLE_PERF_ANALYZER_TS=ON` to the `cmake` command.
- To enable
[Tensorflow Serving backend](benchmarking.md#benchmarking-tensorflow-serving),
add `-DTRITON_ENABLE_PERF_ANALYZER_TFS=ON` to the `cmake` command.
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Measurement Modes
Currently, Perf Analyzer has 2 measurement modes.
## Time Windows
When using time windows measurement mode
([`--measurement-mode=time_windows`](cli.md#--measurement-modetime_windowscount_windows)),
Perf Analyzer will count how many requests have completed during a window of
duration `X` (in milliseconds, via
[`--measurement-interval=X`](cli.md#--measurement-intervaln), default is
`5000`). This is the default measurement mode.
## Count Windows
When using count windows measurement mode
([`--measurement-mode=count_windows`](cli.md#--measurement-modetime_windowscount_windows)),
Perf Analyzer will start the window duration at 1 second and potentially
dynamically increase it until `X` requests have completed (via
[`--measurement-request-count=X`](cli.md#--measurement-request-countn), default
is `50`).
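For example, a hypothetical run (model name is illustrative) that keeps each
measurement window open until at least 100 requests have completed:
```bash
# Grow each measurement window until 100 requests complete.
perf_analyzer -m my_model --measurement-mode=count_windows --measurement-request-count=100
```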
# Metrics
## How Throughput is Calculated
Perf Analyzer calculates throughput to be the total number of requests completed
during a measurement, divided by the duration of the measurement, in seconds.
## How Latency is Calculated
For each request concurrency level Perf Analyzer reports latency and throughput
as seen from Perf Analyzer and also the average request latency on the server.
The server latency measures the total time from when the request is received at
the server until when the response is sent from the server. Because of the HTTP
and gRPC libraries used to implement the server endpoints, total server latency
is typically more accurate for HTTP requests, as it measures the time from the
first byte received until the last byte sent. For both HTTP and gRPC, the total
server latency is broken down into the following components:
- _queue_: The average time spent in the inference schedule queue by a request
waiting for an instance of the model to become available.
- _compute_: The average time spent performing the actual inference, including
any time needed to copy data to/from the GPU.
- _overhead_: The average time spent in the endpoint that cannot be correctly
captured in the send/receive time with the way the gRPC and HTTP libraries are
structured.
The client latency time is broken down further for HTTP and gRPC as follows:
- HTTP: _send/recv_ indicates the time on the client spent sending the request
and receiving the response. _response wait_ indicates time waiting for the
response from the server.
- gRPC: _(un)marshal request/response_ indicates the time spent marshalling the
request data into the gRPC protobuf and unmarshalling the response data from
the gRPC protobuf. _response wait_ indicates time writing the gRPC request to
the network, waiting for the response, and reading the gRPC response from the
network.
Use the verbose ([`-v`](cli.md#-v)) option to see more output, including the
stabilization passes run for each request concurrency level or request rate.
# Reports
## Visualizing Latency vs. Throughput
Perf Analyzer provides the [`-f`](cli.md#-f-path) option to generate a file
containing CSV output of the results.
```
$ perf_analyzer -m inception_graphdef --concurrency-range 1:4 -f perf.csv
...
$ cat perf.csv
Concurrency,Inferences/Second,Client Send,Network+Server Send/Recv,Server Queue,Server Compute Input,Server Compute Infer,Server Compute Output,Client Recv,p50 latency,p90 latency,p95 latency,p99 latency
1,69.2,225,2148,64,206,11781,19,0,13891,18795,19753,21018
3,84.2,237,1768,21673,209,11742,17,0,35398,43984,47085,51701
4,84.2,279,1604,33669,233,11731,18,1,47045,56545,59225,64886
2,87.2,235,1973,9151,190,11346,17,0,21874,28557,29768,34766
```
NOTE: The rows in the CSV file are sorted in increasing order of throughput
(Inferences/Second).
You can import the CSV file into a spreadsheet to help visualize the latency vs
inferences/second tradeoff as well as see some components of the latency. Follow
these steps:
- Open
[this spreadsheet](https://docs.google.com/spreadsheets/d/1S8h0bWBBElHUoLd2SOvQPzZzRiQ55xjyqodm_9ireiw)
- Make a copy from the File menu "Make a copy..."
- Open the copy
- Select the A1 cell on the "Raw Data" tab
- From the File menu select "Import..."
- Select "Upload" and upload the file
- Select "Replace data at selected cell" and then select the "Import data"
button
## Server-side Prometheus metrics
Perf Analyzer can collect
[server-side metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md#gpu-metrics),
such as GPU utilization and GPU power usage. To enable the collection of these
metrics, use the [`--collect-metrics`](cli.md#--collect-metrics) option.
By default, Perf Analyzer queries the metrics endpoint at the URL
`localhost:8002/metrics`. If the metrics are accessible at a different URL, use
the [`--metrics-url=<url>`](cli.md#--metrics-urlurl) option to specify it.
By default, Perf Analyzer queries the metrics endpoint every 1000 milliseconds.
To use a different querying interval, use the
[`--metrics-interval=<n>`](cli.md#--metrics-intervaln) option (specify in
milliseconds).
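For example, a hypothetical invocation (the metrics URL and output file name
are illustrative) that collects metrics from a non-default endpoint every 500
milliseconds and writes them to a CSV file:
```bash
# --verbose-csv is required for the metrics to appear in the CSV output.
perf_analyzer -m my_model --collect-metrics --metrics-url=remote_host:8002/metrics \
    --metrics-interval=500 -f results.csv --verbose-csv
```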
Because Perf Analyzer can collect the server-side metrics multiple times per
run, these metrics are aggregated in specific ways to produce one final number
per searched concurrency or request rate. Here is how the metrics are
aggregated:
| Metric | Aggregation |
| - | - |
| GPU Utilization | Averaged from each collection taken during stable passes. We want a number representative of all stable passes. |
| GPU Power Usage | Averaged from each collection taken during stable passes. We want a number representative of all stable passes. |
| GPU Used Memory | Maximum from all collections taken during a stable pass. Users are typically curious what the peak memory usage is for determining model/hardware viability. |
| GPU Total Memory | First from any collection taken during a stable pass. All of the collections should produce the same value for total memory available on the GPU. |
Note that all metrics are per-GPU in the case of multi-GPU systems.
To output these server-side metrics to a CSV file, use the
[`-f <path>`](cli.md#-f-path) and [`--verbose-csv`](cli.md#--verbose-csv)
options. The output CSV will contain one column per metric. The value of each
column will be a `key:value` pair (`GPU UUID:metric value`). Each `key:value`
pair will be delimited by a semicolon (`;`) to indicate metric values for each
GPU accessible by the server. There is a trailing semicolon. See below:
`<gpu-uuid-0>:<metric-value>;<gpu-uuid-1>:<metric-value>;...;`
Here is a simplified CSV output:
```
$ perf_analyzer -m resnet50_libtorch --collect-metrics -f output.csv --verbose-csv
$ cat output.csv
Concurrency,...,Avg GPU Utilization,Avg GPU Power Usage,Max GPU Memory Usage,Total GPU Memory
1,...,gpu_uuid_0:0.33;gpu_uuid_1:0.5;,gpu_uuid_0:55.3;gpu_uuid_1:56.9;,gpu_uuid_0:10000;gpu_uuid_1:11000;,gpu_uuid_0:50000;gpu_uuid_1:75000;,
2,...,gpu_uuid_0:0.25;gpu_uuid_1:0.6;,gpu_uuid_0:25.6;gpu_uuid_1:77.2;,gpu_uuid_0:11000;gpu_uuid_1:17000;,gpu_uuid_0:50000;gpu_uuid_1:75000;,
3,...,gpu_uuid_0:0.87;gpu_uuid_1:0.9;,gpu_uuid_0:87.1;gpu_uuid_1:71.7;,gpu_uuid_0:15000;gpu_uuid_1:22000;,gpu_uuid_0:50000;gpu_uuid_1:75000;,
```
## Communication Protocol
By default, Perf Analyzer uses HTTP to communicate with Triton. The gRPC
protocol can be specified with the [`-i [http|grpc]`](cli.md#-i-httpgrpc)
option. If gRPC is selected the [`--streaming`](cli.md#--streaming) option can
also be specified for gRPC streaming.
### SSL/TLS Support
Perf Analyzer can be used to benchmark a Triton service behind SSL/TLS-enabled
endpoints. The following options help establish a secure connection with the
endpoint and profile the server.
For gRPC, see the following options:
- [`--ssl-grpc-use-ssl`](cli.md#--ssl-grpc-use-ssl)
- [`--ssl-grpc-root-certifications-file=<path>`](cli.md#--ssl-grpc-root-certifications-filepath)
- [`--ssl-grpc-private-key-file=<path>`](cli.md#--ssl-grpc-private-key-filepath)
- [`--ssl-grpc-certificate-chain-file=<path>`](cli.md#--ssl-grpc-certificate-chain-filepath)
More details here:
https://grpc.github.io/grpc/cpp/structgrpc_1_1_ssl_credentials_options.html
The
[inference protocol gRPC SSL/TLS section](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#ssltls)
describes server-side options to configure SSL/TLS in Triton's gRPC endpoint.
For HTTPS, the following options are exposed:
- [`--ssl-https-verify-peer`](cli.md#--ssl-https-verify-peer01)
- [`--ssl-https-verify-host`](cli.md#--ssl-https-verify-host012)
- [`--ssl-https-ca-certificates-file`](cli.md#--ssl-https-ca-certificates-filepath)
- [`--ssl-https-client-certificate-file`](cli.md#--ssl-https-client-certificate-filepath)
- [`--ssl-https-client-certificate-type`](cli.md#--ssl-https-client-certificate-typepemder)
- [`--ssl-https-private-key-file`](cli.md#--ssl-https-private-key-filepath)
- [`--ssl-https-private-key-type`](cli.md#--ssl-https-private-key-typepemder)
See [`--help`](cli.md#--help) for full documentation.
Unlike gRPC, Triton's HTTP server endpoint cannot be configured with SSL/TLS
support.
Note: Just providing these `--ssl-https-*` options to Perf Analyzer does not
ensure that SSL/TLS is used in communication. If SSL/TLS is not enabled on the
service endpoint, these options have no effect. The intent of exposing these
options is to allow users to configure Perf Analyzer to benchmark a Triton
service behind SSL/TLS-enabled endpoints. In other words, if Triton is running
behind an HTTPS server proxy, then these options allow Perf Analyzer to profile
Triton via the exposed HTTPS proxy.
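For example, a hypothetical HTTPS invocation against a TLS-terminating proxy
(the model name, proxy address, and all file paths are illustrative):
```bash
# Verify the proxy's certificate against a CA bundle and present a client cert.
perf_analyzer -m my_model -i http -u https_proxy_host:443 \
    --ssl-https-verify-peer=1 --ssl-https-verify-host=2 \
    --ssl-https-ca-certificates-file=/path/to/ca_bundle.pem \
    --ssl-https-client-certificate-type=PEM \
    --ssl-https-client-certificate-file=/path/to/client_cert.pem \
    --ssl-https-private-key-type=PEM \
    --ssl-https-private-key-file=/path/to/client_key.pem
```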
<!--
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
# Quick Start
The steps below will guide you on how to start using Perf Analyzer.
### Step 1: Start Triton Container
```bash
export RELEASE=<yy.mm> # e.g. to use the release from the end of February of 2023, do `export RELEASE=23.02`
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3
```
### Step 2: Download `simple` Model
```bash
# inside triton container
git clone --depth 1 https://github.com/triton-inference-server/server
mkdir model_repository ; cp -r server/docs/examples/model_repository/simple model_repository
```
### Step 3: Start Triton Server
```bash
# inside triton container
tritonserver --model-repository $(pwd)/model_repository &> server.log &
# confirm server is ready, look for 'HTTP/1.1 200 OK'
curl -v localhost:8000/v2/health/ready
# detach (CTRL-p CTRL-q)
```
### Step 4: Start Triton SDK Container
```bash
docker pull nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
docker run --gpus all --rm -it --net host nvcr.io/nvidia/tritonserver:${RELEASE}-py3-sdk
```
### Step 5: Run Perf Analyzer
```bash
# inside sdk container
perf_analyzer -m simple
```
### Step 6: Observe and Analyze Output
```
$ perf_analyzer -m simple
*** Measurement Settings ***
Batch size: 1
Service Kind: Triton
Using "time_windows" mode for stabilization
Measurement window: 5000 msec
Using synchronous calls for inference
Stabilizing using average latency
Request concurrency: 1
Client:
Request count: 25348
Throughput: 1407.84 infer/sec
Avg latency: 708 usec (standard deviation 663 usec)
p50 latency: 690 usec
p90 latency: 881 usec
p95 latency: 926 usec
p99 latency: 1031 usec
Avg HTTP time: 700 usec (send/recv 102 usec + response wait 598 usec)
Server:
Inference count: 25348
Execution count: 25348
Successful request count: 25348
Avg request latency: 382 usec (overhead 41 usec + queue 41 usec + compute input 26 usec + compute infer 257 usec + compute output 16 usec)
Inferences/Second vs. Client Average Batch Latency
Concurrency: 1, throughput: 1407.84 infer/sec, latency 708 usec
```
We can see from the output that the model was able to complete approximately
1407.84 inferences per second, with an average latency of 708 microseconds per
inference request. A concurrency of 1 means that Perf Analyzer attempted to
keep 1 inference request outstanding at all times.
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "base_queue_ctx_id_tracker.h"
namespace triton { namespace perfanalyzer {
// Context ID Tracker that reuses IDs in a roughly round-robin manner using a
// FIFO
//
class FifoCtxIdTracker : public BaseQueueCtxIdTracker {
public:
FifoCtxIdTracker() = default;
void Reset(size_t count) override
{
Clear();
for (size_t i = 0; i < count; ++i) {
free_ctx_ids_.push(i);
}
}
};
}}  // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
namespace triton { namespace perfanalyzer {
/// Interface for object that tracks context IDs
///
class ICtxIdTracker {
public:
// Reset the tracker using the provided input count
//
virtual void Reset(size_t count) = 0;
// Restore the given ID into the tracker
//
virtual void Restore(size_t id) = 0;
// Pick and return a Ctx ID
//
virtual size_t Get() = 0;
// Returns true if there are Ctx IDs available to Get.
virtual bool IsAvailable() = 0;
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <chrono>
#include <mutex>
#include <stdexcept>
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestLoadManager;
#endif
/// Class to track idle periods of time
///
class IdleTimer {
public:
void Start()
{
std::lock_guard<std::mutex> lk(mtx_);
StartImpl();
}
void Stop()
{
std::lock_guard<std::mutex> lk(mtx_);
StopImpl();
}
/// Reset the time counter, and restart the timer if it is active
///
void Reset()
{
Restart();
idle_ns_ = 0;
}
/// Returns the number of nanoseconds this timer has counted as being idle
/// If the timer was already active, then it will first stop (and count the
/// pending time), and then start back up
///
uint64_t GetIdleTime()
{
Restart();
return idle_ns_;
}
private:
std::mutex mtx_;
uint64_t idle_ns_{0};
bool is_idle_{false};
std::chrono::steady_clock::time_point start_time_;
void Restart()
{
std::lock_guard<std::mutex> lk(mtx_);
if (is_idle_) {
StopImpl();
StartImpl();
}
}
void StartImpl()
{
if (is_idle_) {
throw std::runtime_error("Can't start a timer that is already active\n");
}
is_idle_ = true;
start_time_ = std::chrono::steady_clock::now();
}
void StopImpl()
{
if (!is_idle_) {
throw std::runtime_error("Can't stop a timer that isn't active\n");
}
is_idle_ = false;
auto end = std::chrono::steady_clock::now();
auto duration = end - start_time_;
idle_ns_ += duration.count();
}
#ifndef DOCTEST_CONFIG_DISABLE
friend TestLoadManager;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "constants.h"
#include "data_loader.h"
#include "infer_data.h"
#include "model_parser.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
/// Interface for classes that manage infer data preparation for inference
///
class IInferDataManager {
public:
/// Initialize this object. Must be called before any other functions
/// \return cb::Error object indicating success or failure.
virtual cb::Error Init() = 0;
/// Populate the target InferData object with input and output objects
/// according to the model's shape
/// \param infer_data The target InferData object.
/// \return cb::Error object indicating success or failure.
virtual cb::Error InitInferData(InferData& infer_data) = 0;
/// Updates the input and expected output data in the target infer_data for an
/// inference request
/// \param thread_id The ID of the calling thread
/// \param stream_index The data stream to use for next data
/// \param step_index The step index to use for next data
/// \param infer_data The target InferData object
/// \return cb::Error object indicating success or failure.
virtual cb::Error UpdateInferData(
size_t thread_id, int stream_index, int step_index,
InferData& infer_data) = 0;
};
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "infer_context.h"
namespace triton { namespace perfanalyzer {
void
InferContext::Init()
{
thread_stat_->status_ = infer_data_manager_->InitInferData(infer_data_);
if (!thread_stat_->status_.IsOk()) {
return;
}
if (streaming_) {
// Decoupled models should not collect client side statistics
thread_stat_->status_ = infer_backend_->StartStream(
async_callback_func_, (!parser_->IsDecoupled()));
if (!thread_stat_->status_.IsOk()) {
return;
}
}
}
void
InferContext::SendInferRequest(bool delayed)
{
// Update the inputs if required
if (using_json_data_) {
UpdateJsonData();
}
SendRequest(request_id_++, delayed);
}
void
InferContext::SendSequenceInferRequest(uint32_t seq_stat_index, bool delayed)
{
// Need lock to protect the order of dispatch across worker threads.
// This also helps in reporting the realistic latencies.
std::lock_guard<std::mutex> guard(
sequence_manager_->GetMutex(seq_stat_index));
if (!early_exit && execute_) {
sequence_manager_->SetInferSequenceOptions(
seq_stat_index, infer_data_.options_);
// Update the inputs if required
if (using_json_data_) {
UpdateSeqJsonData(seq_stat_index);
}
sequence_manager_->DecrementRemainingQueries(seq_stat_index);
SendRequest(
request_id_++, delayed,
sequence_manager_->GetSequenceID(seq_stat_index));
}
}
void
InferContext::CompleteOngoingSequence(uint32_t seq_stat_index)
{
std::lock_guard<std::mutex> guard(
sequence_manager_->GetMutex(seq_stat_index));
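  // If the sequence still has remaining queries, collapse them to a single
  // final request so that the ongoing sequence can be closed out.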
if (sequence_manager_->GetRemainingQueries(seq_stat_index) != 0) {
sequence_manager_->SetRemainingQueries(seq_stat_index, 1);
sequence_manager_->SetInferSequenceOptions(
seq_stat_index, infer_data_.options_);
if (using_json_data_) {
UpdateSeqJsonData(seq_stat_index);
}
sequence_manager_->DecrementRemainingQueries(seq_stat_index);
bool is_delayed = false;
SendRequest(
request_id_++, is_delayed,
sequence_manager_->GetSequenceID(seq_stat_index));
}
}
void
InferContext::SendRequest(
const uint64_t request_id, const bool delayed, const uint64_t sequence_id)
{
if (!thread_stat_->status_.IsOk()) {
return;
}
thread_stat_->num_sent_requests_++;
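  // Async path: record the request in async_req_map_ so the completion
  // callback can compute latency when the response(s) arrive. Sync path:
  // time the blocking Infer() call inline and record the result here.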
if (async_) {
infer_data_.options_->request_id_ = std::to_string(request_id);
{
std::lock_guard<std::mutex> lock(thread_stat_->mu_);
auto it = async_req_map_
.emplace(infer_data_.options_->request_id_, RequestRecord())
.first;
it->second.start_time_ = std::chrono::system_clock::now();
it->second.sequence_end_ = infer_data_.options_->sequence_end_;
it->second.delayed_ = delayed;
it->second.sequence_id_ = sequence_id;
}
thread_stat_->idle_timer.Start();
if (streaming_) {
thread_stat_->status_ = infer_backend_->AsyncStreamInfer(
*(infer_data_.options_), infer_data_.valid_inputs_,
infer_data_.outputs_);
} else {
thread_stat_->status_ = infer_backend_->AsyncInfer(
async_callback_func_, *(infer_data_.options_),
infer_data_.valid_inputs_, infer_data_.outputs_);
}
thread_stat_->idle_timer.Stop();
total_ongoing_requests_++;
} else {
std::chrono::time_point<std::chrono::system_clock> start_time_sync,
end_time_sync;
thread_stat_->idle_timer.Start();
start_time_sync = std::chrono::system_clock::now();
cb::InferResult* results = nullptr;
thread_stat_->status_ = infer_backend_->Infer(
&results, *(infer_data_.options_), infer_data_.valid_inputs_,
infer_data_.outputs_);
thread_stat_->idle_timer.Stop();
if (results != nullptr) {
if (thread_stat_->status_.IsOk()) {
thread_stat_->status_ = ValidateOutputs(results);
}
delete results;
}
if (!thread_stat_->status_.IsOk()) {
return;
}
end_time_sync = std::chrono::system_clock::now();
std::vector<std::chrono::time_point<std::chrono::system_clock>>
end_time_syncs{end_time_sync};
{
      // Add the request record to the thread's request records vector with
      // proper locking
      std::lock_guard<std::mutex> lock(thread_stat_->mu_);
thread_stat_->request_records_.emplace_back(RequestRecord(
start_time_sync, std::move(end_time_syncs),
infer_data_.options_->sequence_end_, delayed, sequence_id, false));
thread_stat_->status_ =
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
if (!thread_stat_->status_.IsOk()) {
return;
}
}
}
}
void
InferContext::UpdateJsonData()
{
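  // Threads stride through the provided data: each call advances this
  // thread's step by the number of active threads, so concurrent threads
  // pick up different portions of the data.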
int step_id = (data_step_id_ * batch_size_) % data_loader_->GetTotalSteps(0);
data_step_id_ += GetNumActiveThreads();
thread_stat_->status_ =
infer_data_manager_->UpdateInferData(thread_id_, 0, step_id, infer_data_);
}
void
InferContext::UpdateSeqJsonData(size_t seq_stat_index)
{
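  // The data step for a sequence request is its zero-based position within
  // the sequence, (sequence_length - remaining_queries), wrapped around the
  // total number of steps in the assigned data stream.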
const size_t sequence_length{
sequence_manager_->GetSequenceLength(seq_stat_index)};
const size_t remaining_queries{
sequence_manager_->GetRemainingQueries(seq_stat_index)};
const uint64_t data_stream_id{
sequence_manager_->GetDataStreamID(seq_stat_index)};
const size_t total_steps{data_loader_->GetTotalSteps(data_stream_id)};
int step_id = (sequence_length - remaining_queries) % total_steps;
thread_stat_->status_ = infer_data_manager_->UpdateInferData(
thread_id_, data_stream_id, step_id, infer_data_);
}
cb::Error
InferContext::ValidateOutputs(const cb::InferResult* result_ptr)
{
// Validate output if set
if (!infer_data_.expected_outputs_.empty()) {
for (size_t i = 0; i < infer_data_.outputs_.size(); ++i) {
const uint8_t* buf = nullptr;
size_t byte_size = 0;
result_ptr->RawData(infer_data_.outputs_[i]->Name(), &buf, &byte_size);
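      // Walk the raw output buffer one batch-1 chunk per expected entry;
      // leftover or missing bytes are reported as a size mismatch.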
for (const auto& expected : infer_data_.expected_outputs_[i]) {
if (!expected.is_valid) {
return cb::Error(
"Expected output can't be invalid", pa::GENERIC_ERROR);
}
if (byte_size < expected.batch1_size) {
return cb::Error(
"Output size doesn't match expected size", pa::GENERIC_ERROR);
} else if (memcmp(buf, expected.data_ptr, expected.batch1_size) != 0) {
return cb::Error(
"Output doesn't match expected output", pa::GENERIC_ERROR);
} else {
buf += expected.batch1_size;
byte_size -= expected.batch1_size;
}
}
if (byte_size != 0) {
return cb::Error(
"Output size doesn't match expected size", pa::GENERIC_ERROR);
}
}
}
return cb::Error::Success;
}
void
InferContext::AsyncCallbackFuncImpl(cb::InferResult* result)
{
std::shared_ptr<cb::InferResult> result_ptr(result);
bool is_final_response{true};
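  // A request may produce multiple responses (e.g. decoupled models); each
  // response timestamp is appended to the pending RequestRecord, and the
  // record is only finalized and removed from async_req_map_ once the final
  // response is observed.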
if (thread_stat_->cb_status_.IsOk()) {
    // Add the request record to the thread's request records vector with
    // proper locking
std::lock_guard<std::mutex> lock(thread_stat_->mu_);
thread_stat_->cb_status_ = result_ptr->RequestStatus();
if (thread_stat_->cb_status_.IsOk()) {
std::string request_id;
thread_stat_->cb_status_ = result_ptr->Id(&request_id);
const auto& it = async_req_map_.find(request_id);
if (it != async_req_map_.end()) {
bool is_null_response{false};
thread_stat_->cb_status_ =
result_ptr->IsNullResponse(&is_null_response);
        if (!thread_stat_->cb_status_.IsOk()) {
return;
}
it->second.response_times_.push_back(std::chrono::system_clock::now());
        if (is_null_response) {
it->second.has_null_last_response_ = true;
}
thread_stat_->cb_status_ =
result_ptr->IsFinalResponse(&is_final_response);
        if (!thread_stat_->cb_status_.IsOk()) {
return;
}
if (is_final_response) {
thread_stat_->request_records_.emplace_back(
it->second.start_time_, it->second.response_times_,
it->second.sequence_end_, it->second.delayed_,
it->second.sequence_id_, it->second.has_null_last_response_);
infer_backend_->ClientInferStat(&(thread_stat_->contexts_stat_[id_]));
thread_stat_->cb_status_ = ValidateOutputs(result);
async_req_map_.erase(request_id);
}
}
}
}
if (is_final_response) {
total_ongoing_requests_--;
if (async_callback_finalize_func_ != nullptr) {
async_callback_finalize_func_(id_);
}
}
}
}} // namespace triton::perfanalyzer
// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <atomic>
#include <functional>
#include <memory>
#include <mutex>
#include <vector>
#include "data_loader.h"
#include "idle_timer.h"
#include "iinfer_data_manager.h"
#include "infer_data.h"
#include "perf_utils.h"
#include "request_record.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
// Holds the running status of the thread.
struct ThreadStat {
ThreadStat() {}
// The status of the worker thread
cb::Error status_;
// The status of the callback thread for async requests
cb::Error cb_status_;
// TODO REFACTOR TMA-1046 -- This should be in the InferContext class
// The statistics of the InferContext
std::vector<cb::InferStat> contexts_stat_;
// Tracks the amount of time this thread spent sleeping or waiting
IdleTimer idle_timer;
// A vector of request records
std::vector<RequestRecord> request_records_;
// A lock to protect thread data
std::mutex mu_;
// The number of sent requests by this thread.
std::atomic<size_t> num_sent_requests_{0};
};
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockInferContext;
#endif
/// Sends inference requests to the server
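/// Typical usage (illustrative): construct the context, call Init() once,
/// then issue requests with SendInferRequest() or SendSequenceInferRequest();
/// GetNumOngoingRequests() reports async requests that have not yet returned.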
class InferContext {
public:
InferContext(
const size_t thread_id, const uint32_t id, const bool async,
const bool streaming, const bool on_sequence_model,
const bool using_json_data, const int32_t batch_size,
std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<DataLoader> data_loader,
std::shared_ptr<ModelParser> parser,
std::shared_ptr<cb::ClientBackendFactory> factory, const bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: thread_id_(thread_id), id_(id), async_(async), streaming_(streaming),
on_sequence_model_(on_sequence_model),
using_json_data_(using_json_data), batch_size_(batch_size),
thread_stat_(thread_stat), data_loader_(data_loader), parser_(parser),
factory_(factory), data_step_id_(id), execute_(execute),
infer_data_manager_(infer_data_manager),
sequence_manager_(sequence_manager)
{
thread_stat_->status_ = factory_->CreateClientBackend(&infer_backend_);
infer_data_.options_.reset(new cb::InferOptions(parser_->ModelName()));
infer_data_.options_->model_version_ = parser_->ModelVersion();
infer_data_.options_->model_signature_name_ = parser_->ModelSignatureName();
thread_stat_->contexts_stat_.emplace_back();
}
InferContext(InferContext&&) = delete;
InferContext(const InferContext&) = delete;
// Initialize the context. Must be done before any inferences are sent
void Init();
// Send a single inference request to the server
void SendInferRequest(bool delayed = false);
// Send a single sequence inference request to the server
  void SendSequenceInferRequest(uint32_t seq_stat_index, bool delayed = false);
// Finish the active sequence at the given seq_stat_index
void CompleteOngoingSequence(uint32_t seq_stat_index);
// Returns the total number of async requests that have been sent by this
// object and have not returned
uint GetNumOngoingRequests() { return total_ongoing_requests_; }
// Register a function that will get called after every async request returns
void RegisterAsyncCallbackFinalize(std::function<void(uint32_t)> callback)
{
async_callback_finalize_func_ = callback;
}
// TODO REFACTOR TMA-1043 this should be in memory class
void SetNumActiveThreads(size_t num_threads)
{
num_active_threads_ = num_threads;
}
protected:
/// A helper function to issue inference request to the server.
/// \param request_id The unique id to be associated with the request.
/// \param delayed Whether the request fell behind its scheduled time.
/// \param sequence_id Sequence ID of the request. Note that the default of
/// `0` means the request is not a sequence.
virtual void SendRequest(
const uint64_t request_id, const bool delayed,
const uint64_t sequence_id = 0);
/// Update inputs based on custom json data
void UpdateJsonData();
/// Update inputs based on custom json data for the given sequence
void UpdateSeqJsonData(size_t seq_stat_index);
cb::Error ValidateOutputs(const cb::InferResult* result_ptr);
// Callback function for handling asynchronous requests
void AsyncCallbackFuncImpl(cb::InferResult* result);
bool async_{false};
bool streaming_{false};
const bool on_sequence_model_{false};
bool using_json_data_{false};
const int32_t batch_size_{0};
std::shared_ptr<ThreadStat> thread_stat_;
std::shared_ptr<DataLoader> data_loader_;
std::shared_ptr<ModelParser> parser_;
std::shared_ptr<cb::ClientBackendFactory> factory_;
std::shared_ptr<IInferDataManager> infer_data_manager_;
uint64_t request_id_ = 0;
std::map<std::string, RequestRecord> async_req_map_;
std::atomic<uint> total_ongoing_requests_{0};
size_t data_step_id_;
// Function pointer to the async callback function implementation
std::function<void(cb::InferResult*)> async_callback_func_ = std::bind(
&InferContext::AsyncCallbackFuncImpl, this, std::placeholders::_1);
// Function pointer to registered async callbacks
std::function<void(uint32_t)> async_callback_finalize_func_ = nullptr;
private:
const uint32_t id_{0};
const size_t thread_id_{0};
size_t GetNumActiveThreads() { return num_active_threads_; }
size_t num_active_threads_{0};
// The backend to communicate with the server
std::unique_ptr<cb::ClientBackend> infer_backend_;
InferData infer_data_;
// FIXME: update build to use C++17 instead of C++14. This is a workaround
// since C++14 doesn't have std::optional, but C++17 does.
const bool execute_placeholder_{false};
std::reference_wrapper<const bool> execute_{execute_placeholder_};
std::shared_ptr<SequenceManager> sequence_manager_{nullptr};
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockInferContext;
public:
InferContext() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "client_backend/client_backend.h"
#include "tensor_data.h"
namespace triton { namespace perfanalyzer {
/// Holds all the data needed to send an inference request
struct InferData {
~InferData()
{
for (const auto input : inputs_) {
delete input;
}
for (const auto output : outputs_) {
delete output;
}
}
// The vector of pointers to InferInput objects for all possible inputs,
// potentially including optional inputs with no provided data.
std::vector<cb::InferInput*> inputs_;
  // The vector of pointers to InferInput objects actually used for the
  // inference request. These pointers are non-owning: only 'inputs_' and
  // 'outputs_' are deleted in the destructor.
  std::vector<cb::InferInput*> valid_inputs_;
// The vector of pointers to InferRequestedOutput objects
// to be used with the inference request.
std::vector<const cb::InferRequestedOutput*> outputs_;
  // If not empty, the expected output data, in the same order as 'outputs_'.
  // The outer vector is per-output; the inner vector holds the batched
  // entries for each output.
std::vector<std::vector<TensorData>> expected_outputs_;
// The InferOptions object holding the details of the
// inference.
std::unique_ptr<cb::InferOptions> options_;
};
}} // namespace triton::perfanalyzer