// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "scoped_defer.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
ScopedDefer::ScopedDefer(std::function<void()> task)
{
task_ = task;
done_ = false;
}
void
ScopedDefer::Complete()
{
if (!done_) {
task_();
done_ = true;
}
}
ScopedDefer::~ScopedDefer()
{
if (!done_) {
task_();
}
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <functional>
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class ScopedDefer {
public:
ScopedDefer(std::function<void()> task);
~ScopedDefer();
void Complete();
private:
std::function<void()> task_;
bool done_;
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
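// Illustrative usage sketch (not part of the original sources). ScopedDefer
// runs the supplied task in its destructor unless Complete() has already run
// it, which makes it a simple way to guarantee cleanup on every exit path.
// The function below is hypothetical and uses only the class declared above
// plus <cstdio>.
#include <cstdio>
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline void
ScopedDeferUsageExample(std::FILE* file)
{
  // Close 'file' when this scope exits, even on an early return.
  ScopedDefer closer([file]() { std::fclose(file); });
  // ... work with 'file' ...
  closer.Complete();  // run the cleanup eagerly; the destructor then no-ops
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi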
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "shared_library.h"
#include <dlfcn.h>
#include <iostream>
/// FIXME: Duplication of server/src/core/shared_library.cc
/// Separate shared_library to common library and delete this
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
Error
OpenLibraryHandle(const std::string& path, void** handle)
{
*handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
if (*handle == nullptr) {
return Error("unable to load backend library: " + std::string(dlerror()));
}
return Error::Success;
}
Error
CloseLibraryHandle(void* handle)
{
if (handle != nullptr) {
if (dlclose(handle) != 0) {
return Error(
"unable to unload backend library: " + std::string(dlerror()));
}
}
return Error::Success;
}
Error
GetEntrypoint(
void* handle, const std::string& name, const bool optional, void** befn)
{
*befn = nullptr;
dlerror();
void* fn = dlsym(handle, name.c_str());
const char* dlsym_error = dlerror();
if (dlsym_error != nullptr) {
if (optional) {
return Error::Success;
}
std::string errstr(dlsym_error); // need copy as dlclose overwrites
return Error(
"unable to find required entrypoint '" + name +
"' in backend library: " + errstr);
}
if (fn == nullptr) {
if (optional) {
return Error::Success;
}
return Error(
"unable to find required entrypoint '" + name + "' in backend library");
}
*befn = fn;
return Error::Success;
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "../client_backend.h"
/// FIXME: Duplication of server/src/core/shared_library.h
/// Separate shared_library to common library and delete this
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
Error OpenLibraryHandle(const std::string& path, void** handle);
Error CloseLibraryHandle(void* handle);
Error GetEntrypoint(
void* handle, const std::string& name, const bool optional, void** befn);
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
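// Illustrative usage sketch (not part of the original sources): open a shared
// library, resolve one entrypoint, and close the handle again. The library
// path is a hypothetical placeholder; only the three helpers declared above
// and Error::IsOk() are used.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
SharedLibraryUsageExample()
{
  void* handle = nullptr;
  Error err = OpenLibraryHandle(
      "/opt/tritonserver/lib/libtritonserver.so" /* hypothetical path */,
      &handle);
  if (!err.IsOk()) {
    return err;
  }
  // Resolve a required (non-optional) symbol into a generic function pointer.
  void* fn = nullptr;
  err = GetEntrypoint(handle, "TRITONSERVER_ApiVersion", false /* optional */, &fn);
  if (!err.IsOk()) {
    CloseLibraryHandle(handle);
    return err;
  }
  // ... cast 'fn' to the matching function-pointer type before calling it ...
  return CloseLibraryHandle(handle);
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi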
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "shared_memory_manager.h"
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "common.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
SharedMemoryManager::~SharedMemoryManager()
{
UnregisterAll(TRITONSERVER_MEMORY_CPU);
UnregisterAll(TRITONSERVER_MEMORY_GPU);
}
#ifdef TRITON_ENABLE_GPU
Error
SharedMemoryManager::RegisterCUDAMemory(
const std::string& name, void* dev_ptr, const size_t byte_size,
const int device_id)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
// If name is already in shared_memory_map_ then return error saying already
// registered
if (shared_memory_map_.find(name) != shared_memory_map_.end()) {
return Error(
std::string("shared memory region '" + name + "' already in manager"));
}
shared_memory_map_.insert(std::make_pair(
name, std::unique_ptr<MemoryInfo>(new MemoryInfo(
name, 0 /* offset */, byte_size, dev_ptr,
TRITONSERVER_MEMORY_GPU, device_id))));
return Error::Success;
}
#endif // TRITON_ENABLE_GPU
Error
SharedMemoryManager::RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
// If name is already in shared_memory_map_ then return error saying already
// registered
if (shared_memory_map_.find(name) != shared_memory_map_.end()) {
return Error("shared memory region '" + name + "' already in manager");
}
shared_memory_map_.insert(std::make_pair(
name, std::make_unique<MemoryInfo>(
name, 0 /* offset */, byte_size, ptr, TRITONSERVER_MEMORY_CPU,
0 /* device id */)));
return Error::Success;
}
Error
SharedMemoryManager::GetMemoryInfo(
const std::string& name, size_t offset, void** shm_mapped_addr,
TRITONSERVER_MemoryType* memory_type, int64_t* device_id)
{
// protect shared_memory_map_ from concurrent access
std::lock_guard<std::mutex> lock(mu_);
auto it = shared_memory_map_.find(name);
if (it == shared_memory_map_.end()) {
return Error(
std::string("Unable to find shared memory region: '" + name + "'"));
}
if (it->second->kind_ == TRITONSERVER_MEMORY_CPU) {
*shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ +
it->second->offset_ + offset);
} else {
*shm_mapped_addr = (void*)((uint8_t*)it->second->mapped_addr_ + offset);
}
*memory_type = it->second->kind_;
*device_id = it->second->device_id_;
return Error::Success;
}
Error
SharedMemoryManager::Unregister(
const std::string& name, TRITONSERVER_MemoryType memory_type)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
return UnregisterHelper(name, memory_type);
}
Error
SharedMemoryManager::UnregisterAll(TRITONSERVER_MemoryType memory_type)
{
// Serialize all operations that write/read current shared memory regions
std::lock_guard<std::mutex> lock(mu_);
std::string error_message = "Failed to unregister the following ";
std::vector<std::string> unregister_fails;
if (memory_type == TRITONSERVER_MEMORY_CPU) {
error_message += "system shared memory regions: ";
for (auto& it : shared_memory_map_) {
if (it.second->kind_ == TRITONSERVER_MEMORY_CPU) {
Error err = UnregisterHelper(it.first, memory_type);
if (!err.IsOk()) {
unregister_fails.push_back(it.first);
}
}
}
} else if (memory_type == TRITONSERVER_MEMORY_GPU) {
error_message += "cuda shared memory regions: ";
for (auto& it : shared_memory_map_) {
if (it.second->kind_ == TRITONSERVER_MEMORY_GPU) {
Error err = UnregisterHelper(it.first, memory_type);
if (!err.IsOk()) {
unregister_fails.push_back(it.first);
}
}
}
}
if (!unregister_fails.empty()) {
for (auto unreg_fail : unregister_fails) {
error_message += unreg_fail + ", ";
}
return Error(error_message);
}
return Error::Success;
}
Error
SharedMemoryManager::UnregisterHelper(
const std::string& name, TRITONSERVER_MemoryType memory_type)
{
// Must hold the lock on mu_ while calling this function.
auto it = shared_memory_map_.find(name);
if (it == shared_memory_map_.end()) {
return Error("Shared memory region " + name + " doesn't exist.");
}
// Remove region information from shared_memory_map_
shared_memory_map_.erase(it);
return Error::Success;
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <triton/core/tritonserver.h>
#include <cstring>
#include <map>
#include <memory>
#include <mutex>
#include "../client_backend.h"
#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif // TRITON_ENABLE_GPU
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class SharedMemoryManager {
public:
SharedMemoryManager() = default;
~SharedMemoryManager();
#ifdef TRITON_ENABLE_GPU
/// Add a block of CUDA (GPU) memory to the manager. Return an Error
/// if a memory block of the same name already exists in the manager.
/// \param name The name of the memory block.
/// \param dev_ptr The CUDA device pointer to the memory block.
/// \param byte_size The size, in bytes, of the block.
/// \param device_id The GPU number the memory region is in.
/// \return an Error indicating success or failure.
Error RegisterCUDAMemory(
const std::string& name, void* dev_ptr, const size_t byte_size,
const int device_id);
#endif // TRITON_ENABLE_GPU
/// Add a system memory block to the manager.
/// Return an Error if a shared memory block of the same name
/// already exists in the manager.
/// \param name The name of the memory block.
/// \param ptr The pointer to the system memory block.
/// \param byte_size The size, in bytes, of the block.
/// \return an Error indicating success or failure.
Error RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size);
/// Get the access information for the shared memory block with the specified
/// name. Return an Error if the named block doesn't exist.
/// \param name The name of the shared memory block to get.
/// \param offset The offset within the block.
/// \param shm_mapped_addr Returns the pointer to the shared
/// memory block with the specified name and offset.
/// \param memory_type Returns the type of the memory.
/// \param device_id Returns the device id associated with the
/// memory block.
/// \return an Error indicating success or failure.
Error GetMemoryInfo(
const std::string& name, size_t offset, void** shm_mapped_addr,
TRITONSERVER_MemoryType* memory_type, int64_t* device_id);
/// Removes the named shared memory block of the specified type from
/// the manager. Any future attempt to get the details of this block
/// will result in an error until another block with the same name is
/// added to the manager.
/// \param name The name of the shared memory block to remove.
/// \param memory_type The type of memory to unregister.
/// \return an Error indicating success or failure.
Error Unregister(
const std::string& name, TRITONSERVER_MemoryType memory_type);
/// Unregister all shared memory blocks of specified type from the manager.
/// \param memory_type The type of memory to unregister.
/// \return an Error indicating success or failure.
Error UnregisterAll(TRITONSERVER_MemoryType memory_type);
private:
/// A helper function to remove the named shared memory block of the
/// specified type.
Error UnregisterHelper(
const std::string& name, TRITONSERVER_MemoryType memory_type);
/// A struct that records the shared memory regions registered by the shared
/// memory manager.
struct MemoryInfo {
MemoryInfo(
const std::string& name, const size_t offset, const size_t byte_size,
void* mapped_addr, const TRITONSERVER_MemoryType kind,
const int64_t device_id)
: name_(name), offset_(offset), byte_size_(byte_size),
mapped_addr_(mapped_addr), kind_(kind), device_id_(device_id)
{
}
std::string name_;
size_t offset_;
size_t byte_size_;
void* mapped_addr_;
TRITONSERVER_MemoryType kind_;
int64_t device_id_;
};
using SharedMemoryStateMap =
std::map<std::string, std::unique_ptr<MemoryInfo>>;
// A map between the name and the details of the associated
// shared memory block
SharedMemoryStateMap shared_memory_map_;
// A mutex to protect the concurrent access to shared_memory_map_
std::mutex mu_;
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
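// Illustrative usage sketch (not part of the original sources): register a
// block of system memory, look it up again, and unregister all CPU regions.
// The buffer and region name are hypothetical placeholders; the calls follow
// the declarations above.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
SharedMemoryManagerUsageExample()
{
  static char buffer[64];  // hypothetical data block that outlives the manager
  SharedMemoryManager manager;
  Error err =
      manager.RegisterSystemMemory("example_region", buffer, sizeof(buffer));
  if (!err.IsOk()) {
    return err;
  }
  // Retrieve the mapped address, memory type, and device id at a given offset.
  void* addr = nullptr;
  TRITONSERVER_MemoryType memory_type;
  int64_t device_id;
  err = manager.GetMemoryInfo(
      "example_region", 16 /* offset */, &addr, &memory_type, &device_id);
  if (!err.IsOk()) {
    return err;
  }
  // The destructor would also unregister everything that is still registered.
  return manager.UnregisterAll(TRITONSERVER_MEMORY_CPU);
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi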
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton_c_api_backend.h"
#include "c_api_infer_results.h"
#include "json_utils.h"
#include "triton_loader.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
//==============================================================================
Error
TritonCApiClientBackend::Create(
const std::string& triton_server_path,
const std::string& model_repository_path, const bool verbose,
std::unique_ptr<ClientBackend>* client_backend)
{
if (triton_server_path.empty()) {
return Error(
"--triton-server-path should not be empty when using "
"service-kind=triton_c_api.");
}
if (model_repository_path.empty()) {
return Error(
"--model-repository should not be empty when using "
"service-kind=triton_c_api.");
}
std::unique_ptr<TritonCApiClientBackend> triton_client_backend(
new TritonCApiClientBackend());
TritonLoader::Create(triton_server_path, model_repository_path, verbose);
*client_backend = std::move(triton_client_backend);
return Error::Success;
}
Error
TritonCApiClientBackend::ServerExtensions(std::set<std::string>* extensions)
{
rapidjson::Document server_metadata_json;
RETURN_IF_ERROR(triton_loader_->ServerMetaData(&server_metadata_json));
for (const auto& extension : server_metadata_json["extensions"].GetArray()) {
extensions->insert(
std::string(extension.GetString(), extension.GetStringLength()));
}
return Error::Success;
}
Error
TritonCApiClientBackend::ModelMetadata(
rapidjson::Document* model_metadata, const std::string& model_name,
const std::string& model_version)
{
if (!triton_loader_->ModelIsLoaded()) {
triton_loader_->LoadModel(model_name, model_version);
}
RETURN_IF_ERROR(triton_loader_->ModelMetadata(model_metadata));
return Error::Success;
}
Error
TritonCApiClientBackend::ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version)
{
if (!triton_loader_->ModelIsLoaded()) {
triton_loader_->LoadModel(model_name, model_version);
}
RETURN_IF_ERROR(
triton_loader_->ModelConfig(model_config, model_name, model_version));
return Error::Success;
}
Error
TritonCApiClientBackend::Infer(
cb::InferResult** result, const InferOptions& options,
const std::vector<InferInput*>& inputs,
const std::vector<const InferRequestedOutput*>& outputs)
{
std::vector<tc::InferInput*> triton_inputs;
ParseInferInputToTriton(inputs, &triton_inputs);
std::vector<const tc::InferRequestedOutput*> triton_outputs;
ParseInferRequestedOutputToTriton(outputs, &triton_outputs);
tc::InferOptions triton_options(options.model_name_);
ParseInferOptionsToTriton(options, &triton_options);
capi::InferResult* triton_result;
RETURN_IF_ERROR(triton_loader_->Infer(
triton_options, triton_inputs, triton_outputs, &triton_result));
*result = new TritonCApiInferResult(triton_result);
return Error::Success;
}
Error
TritonCApiClientBackend::ClientInferStat(InferStat* infer_stat)
{
tc::InferStat triton_infer_stat;
triton_loader_->ClientInferStat(&triton_infer_stat);
ParseInferStat(triton_infer_stat, infer_stat);
return Error::Success;
}
Error
TritonCApiClientBackend::ModelInferenceStatistics(
std::map<ModelIdentifier, ModelStatistics>* model_stats,
const std::string& model_name, const std::string& model_version)
{
rapidjson::Document infer_stat_json;
RETURN_IF_ERROR(triton_loader_->ModelInferenceStatistics(
model_name, model_version, &infer_stat_json));
ParseStatistics(infer_stat_json, model_stats);
return Error::Success;
}
Error
TritonCApiClientBackend::UnregisterAllSharedMemory()
{
RETURN_IF_ERROR(triton_loader_->UnregisterAllSharedMemory());
return Error::Success;
}
Error
TritonCApiClientBackend::RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size)
{
RETURN_IF_ERROR(triton_loader_->RegisterSystemMemory(name, ptr, byte_size));
return Error::Success;
}
#ifdef TRITON_ENABLE_GPU
Error
TritonCApiClientBackend::RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size)
{
RETURN_IF_ERROR(triton_loader_->RegisterCudaMemory(name, handle, byte_size));
return Error::Success;
}
#endif // TRITON_ENABLE_GPU
void
TritonCApiClientBackend::ParseInferInputToTriton(
const std::vector<InferInput*>& inputs,
std::vector<tc::InferInput*>* triton_inputs)
{
for (const auto input : inputs) {
triton_inputs->push_back(
(dynamic_cast<TritonCApiInferInput*>(input))->Get());
}
}
void
TritonCApiClientBackend::ParseInferRequestedOutputToTriton(
const std::vector<const InferRequestedOutput*>& outputs,
std::vector<const tc::InferRequestedOutput*>* triton_outputs)
{
for (const auto output : outputs) {
triton_outputs->push_back(
(dynamic_cast<const TritonCApiInferRequestedOutput*>(output))->Get());
}
}
void
TritonCApiClientBackend::ParseInferOptionsToTriton(
const InferOptions& options, tc::InferOptions* triton_options)
{
triton_options->model_version_ = options.model_version_;
triton_options->request_id_ = options.request_id_;
if ((options.sequence_id_ != 0) || (options.sequence_id_str_ != "")) {
if (options.sequence_id_ != 0) {
triton_options->sequence_id_ = options.sequence_id_;
} else {
triton_options->sequence_id_str_ = options.sequence_id_str_;
}
triton_options->sequence_start_ = options.sequence_start_;
triton_options->sequence_end_ = options.sequence_end_;
}
}
void
TritonCApiClientBackend::ParseStatistics(
const rapidjson::Document& infer_stat,
std::map<ModelIdentifier, ModelStatistics>* model_stats)
{
model_stats->clear();
for (const auto& this_stat : infer_stat["model_stats"].GetArray()) {
auto it = model_stats
->emplace(
std::make_pair(
this_stat["name"].GetString(),
this_stat["version"].GetString()),
ModelStatistics())
.first;
it->second.inference_count_ = this_stat["inference_count"].GetUint64();
it->second.execution_count_ = this_stat["execution_count"].GetUint64();
it->second.success_count_ =
this_stat["inference_stats"]["success"]["count"].GetUint64();
it->second.queue_count_ =
this_stat["inference_stats"]["queue"]["count"].GetUint64();
it->second.compute_input_count_ =
this_stat["inference_stats"]["compute_input"]["count"].GetUint64();
it->second.compute_infer_count_ =
this_stat["inference_stats"]["compute_infer"]["count"].GetUint64();
it->second.compute_output_count_ =
this_stat["inference_stats"]["compute_output"]["count"].GetUint64();
it->second.cumm_time_ns_ =
this_stat["inference_stats"]["success"]["ns"].GetUint64();
it->second.queue_time_ns_ =
this_stat["inference_stats"]["queue"]["ns"].GetUint64();
it->second.compute_input_time_ns_ =
this_stat["inference_stats"]["compute_input"]["ns"].GetUint64();
it->second.compute_infer_time_ns_ =
this_stat["inference_stats"]["compute_infer"]["ns"].GetUint64();
it->second.compute_output_time_ns_ =
this_stat["inference_stats"]["compute_output"]["ns"].GetUint64();
it->second.cache_hit_count_ =
this_stat["inference_stats"]["cache_hit"]["count"].GetUint64();
it->second.cache_hit_time_ns_ =
this_stat["inference_stats"]["cache_hit"]["ns"].GetUint64();
it->second.cache_miss_count_ =
this_stat["inference_stats"]["cache_miss"]["count"].GetUint64();
it->second.cache_miss_time_ns_ =
this_stat["inference_stats"]["cache_miss"]["ns"].GetUint64();
}
}
void
TritonCApiClientBackend::ParseInferStat(
const tc::InferStat& triton_infer_stat, InferStat* infer_stat)
{
infer_stat->completed_request_count =
triton_infer_stat.completed_request_count;
infer_stat->cumulative_total_request_time_ns =
triton_infer_stat.cumulative_total_request_time_ns;
infer_stat->cumulative_send_time_ns =
triton_infer_stat.cumulative_send_time_ns;
infer_stat->cumulative_receive_time_ns =
triton_infer_stat.cumulative_receive_time_ns;
}
//==============================================================================
Error
TritonCApiInferInput::Create(
InferInput** infer_input, const std::string& name,
const std::vector<int64_t>& dims, const std::string& datatype)
{
TritonCApiInferInput* local_infer_input =
new TritonCApiInferInput(name, datatype);
tc::InferInput* triton_infer_input;
RETURN_IF_TRITON_ERROR(
tc::InferInput::Create(&triton_infer_input, name, dims, datatype));
local_infer_input->input_.reset(triton_infer_input);
*infer_input = local_infer_input;
return Error::Success;
}
const std::vector<int64_t>&
TritonCApiInferInput::Shape() const
{
return input_->Shape();
}
Error
TritonCApiInferInput::SetShape(const std::vector<int64_t>& shape)
{
RETURN_IF_TRITON_ERROR(input_->SetShape(shape));
return Error::Success;
}
Error
TritonCApiInferInput::Reset()
{
RETURN_IF_TRITON_ERROR(input_->Reset());
return Error::Success;
}
Error
TritonCApiInferInput::AppendRaw(const uint8_t* input, size_t input_byte_size)
{
RETURN_IF_TRITON_ERROR(input_->AppendRaw(input, input_byte_size));
return Error::Success;
}
Error
TritonCApiInferInput::SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset)
{
RETURN_IF_TRITON_ERROR(input_->SetSharedMemory(name, byte_size, offset));
return Error::Success;
}
TritonCApiInferInput::TritonCApiInferInput(
const std::string& name, const std::string& datatype)
: InferInput(BackendKind::TRITON_C_API, name, datatype)
{
}
//==============================================================================
Error
TritonCApiInferRequestedOutput::Create(
InferRequestedOutput** infer_output, const std::string& name,
const size_t class_count)
{
TritonCApiInferRequestedOutput* local_infer_output =
new TritonCApiInferRequestedOutput(name);
tc::InferRequestedOutput* triton_infer_output;
RETURN_IF_TRITON_ERROR(tc::InferRequestedOutput::Create(
&triton_infer_output, name, class_count));
local_infer_output->output_.reset(triton_infer_output);
*infer_output = local_infer_output;
return Error::Success;
}
Error
TritonCApiInferRequestedOutput::SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset)
{
RETURN_IF_TRITON_ERROR(output_->SetSharedMemory(name, byte_size, offset));
return Error::Success;
}
TritonCApiInferRequestedOutput::TritonCApiInferRequestedOutput(
const std::string& name)
: InferRequestedOutput(BackendKind::TRITON_C_API, name)
{
}
//==============================================================================
TritonCApiInferResult::TritonCApiInferResult(capi::InferResult* result)
{
result_.reset(result);
}
Error
TritonCApiInferResult::Id(std::string* id) const
{
RETURN_IF_TRITON_ERROR(result_->Id(id));
return Error::Success;
}
Error
TritonCApiInferResult::RequestStatus() const
{
RETURN_IF_TRITON_ERROR(result_->RequestStatus());
return Error::Success;
}
Error
TritonCApiInferResult::RawData(
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const
{
return Error(
"Output retrieval is not currently supported for Triton C API client "
"backend");
}
//==============================================================================
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <string>
#include "../client_backend.h"
#include "shared_memory_manager.h"
#include "triton_loader.h"
#define RETURN_IF_TRITON_ERROR(S) \
do { \
const tc::Error& status__ = (S); \
if (!status__.IsOk()) { \
return Error(status__.Message()); \
} \
} while (false)
#define FAIL_IF_TRITON_ERR(X, MSG) \
{ \
const tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace tc = triton::client;
namespace cb = triton::perfanalyzer::clientbackend;
namespace capi = triton::perfanalyzer::clientbackend::tritoncapi;
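// Illustrative usage sketch (not part of the original sources): the macros
// above adapt triton::client (tc) errors. RETURN_IF_TRITON_ERROR converts a
// failed tc::Error into this backend's Error and returns it from the current
// function; FAIL_IF_TRITON_ERR logs the message and exits. The helper below
// is hypothetical and simply wraps an already-constructed tc::Error.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
ReturnIfTritonErrorUsageExample(const tc::Error& status)
{
  // On failure this expands to: return Error(status.Message());
  RETURN_IF_TRITON_ERROR(status);
  return Error::Success;
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi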
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class InferResult;
//==============================================================================
/// TritonCApiClientBackend uses the Triton client C++ library types together
/// with the locally loaded Triton server C API library to communicate with
/// the inference server in-process.
///
class TritonCApiClientBackend : public ClientBackend {
public:
/// Create a triton client backend which can be used to interact with the
/// server.
/// \param triton_server_path Path to the Triton server install directory
/// that contains lib/libtritonserver.so.
/// \param model_repository_path The model repository.
/// \param verbose Enables the verbose mode of TritonServer.
/// \param client_backend Returns a new TritonCApiClientBackend object.
/// \return Error object indicating success
/// or failure.
static Error Create(
const std::string& triton_server_path,
const std::string& model_repository_path, const bool verbose,
std::unique_ptr<ClientBackend>* client_backend);
~TritonCApiClientBackend() { triton_loader_->Delete(); }
/// See ClientBackend::ServerExtensions()
Error ServerExtensions(std::set<std::string>* server_extensions) override;
/// See ClientBackend::ModelMetadata()
Error ModelMetadata(
rapidjson::Document* model_metadata, const std::string& model_name,
const std::string& model_version) override;
/// See ClientBackend::ModelConfig()
Error ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version) override;
/// See ClientBackend::Infer()
Error Infer(
cb::InferResult** result, const InferOptions& options,
const std::vector<InferInput*>& inputs,
const std::vector<const InferRequestedOutput*>& outputs) override;
/// See ClientBackend::ClientInferStat()
Error ClientInferStat(InferStat* infer_stat) override;
/// See ClientBackend::ModelInferenceStatistics()
Error ModelInferenceStatistics(
std::map<ModelIdentifier, ModelStatistics>* model_stats,
const std::string& model_name = "",
const std::string& model_version = "") override;
#ifdef TRITON_ENABLE_GPU
/// See ClientBackend::RegisterCudaMemory
Error RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size) override;
#endif // TRITON_ENABLE_GPU
/// See ClientBackend::RegisterSystemMemory
Error RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size) override;
/// See ClientBackend::UnregisterAllSharedMemory
Error UnregisterAllSharedMemory();
private:
TritonCApiClientBackend()
: ClientBackend(BackendKind::TRITON_C_API),
triton_loader_(TritonLoader::GetSingleton())
{
}
void ParseInferInputToTriton(
const std::vector<InferInput*>& inputs,
std::vector<tc::InferInput*>* triton_inputs);
void ParseInferRequestedOutputToTriton(
const std::vector<const InferRequestedOutput*>& outputs,
std::vector<const tc::InferRequestedOutput*>* triton_outputs);
void ParseInferOptionsToTriton(
const InferOptions& options, tc::InferOptions* triton_options);
void ParseStatistics(
const rapidjson::Document& infer_stat,
std::map<ModelIdentifier, ModelStatistics>* model_stats);
void ParseInferStat(
const tc::InferStat& triton_infer_stat, InferStat* infer_stat);
TritonLoader* triton_loader_;
};
//==============================================================
/// TritonCApiInferInput is a wrapper around InferInput object of
/// triton client library.
///
class TritonCApiInferInput : public InferInput {
public:
static Error Create(
InferInput** infer_input, const std::string& name,
const std::vector<int64_t>& dims, const std::string& datatype);
/// Returns the raw InferInput object required by triton client library.
tc::InferInput* Get() const { return input_.get(); }
/// See InferInput::Shape()
const std::vector<int64_t>& Shape() const override;
/// See InferInput::SetShape()
Error SetShape(const std::vector<int64_t>& shape) override;
/// See InferInput::Reset()
Error Reset() override;
/// See InferInput::AppendRaw()
Error AppendRaw(const uint8_t* input, size_t input_byte_size) override;
/// See InferInput::SetSharedMemory()
Error SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset = 0) override;
private:
explicit TritonCApiInferInput(
const std::string& name, const std::string& datatype);
std::unique_ptr<tc::InferInput> input_;
};
//==============================================================
/// TritonCApiInferRequestedOutput is a wrapper around
/// InferRequestedOutput object of triton client library.
///
class TritonCApiInferRequestedOutput : public InferRequestedOutput {
public:
static Error Create(
InferRequestedOutput** infer_output, const std::string& name,
const size_t class_count = 0);
/// Returns the raw InferRequestedOutput object required by triton client
/// library.
tc::InferRequestedOutput* Get() const { return output_.get(); }
/// See InferInput::SetSharedMemory()
Error SetSharedMemory(
const std::string& name, size_t byte_size, size_t offset = 0) override;
private:
explicit TritonCApiInferRequestedOutput(const std::string& name);
std::unique_ptr<tc::InferRequestedOutput> output_;
};
//==============================================================
/// TritonCApiInferResult is a wrapper around InferResult object of
/// the C API library.
///
class TritonCApiInferResult : public cb::InferResult {
public:
explicit TritonCApiInferResult(capi::InferResult* result);
/// See InferResult::Id()
Error Id(std::string* id) const override;
/// See InferResult::RequestStatus()
Error RequestStatus() const override;
/// See InferResult::RawData()
Error RawData(
const std::string& output_name, const uint8_t** buf,
size_t* byte_size) const override;
private:
std::unique_ptr<capi::InferResult> result_;
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
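// Illustrative usage sketch (not part of the original sources): create the
// C API client backend and query the server extensions. The server and model
// repository paths are hypothetical placeholders; the calls follow the
// declarations above.
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
inline Error
TritonCApiClientBackendUsageExample()
{
  std::unique_ptr<ClientBackend> backend;
  Error err = TritonCApiClientBackend::Create(
      "/opt/tritonserver" /* triton_server_path */,
      "/workspace/models" /* model_repository_path */, false /* verbose */,
      &backend);
  if (!err.IsOk()) {
    return err;
  }
  std::set<std::string> extensions;
  err = backend->ServerExtensions(&extensions);
  if (!err.IsOk()) {
    return err;
  }
  // ... build InferInput/InferRequestedOutput objects and call Infer() ...
  return Error::Success;
}
}}}}  // namespace triton::perfanalyzer::clientbackend::tritoncapi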
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define TRITON_INFERENCE_SERVER_CLIENT_CLASS \
triton::perfanalyzer::clientbackend::tritoncapi::TritonLoader
#include "triton_loader.h"
#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <sys/stat.h>
#include <future>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include "c_api_infer_results.h"
#include "scoped_defer.h"
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
namespace {
struct AllocPayload {
struct OutputInfo {
enum Kind { BINARY, SHM };
Kind kind_;
void* base_;
uint64_t byte_size_;
TRITONSERVER_MemoryType memory_type_;
int64_t device_id_;
// For shared memory
OutputInfo(
void* base, uint64_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t device_id)
: kind_(SHM), base_(base), byte_size_(byte_size),
memory_type_(memory_type), device_id_(device_id)
{
}
};
~AllocPayload()
{
for (auto it : output_map_) {
delete it.second;
}
}
std::unordered_map<std::string, OutputInfo*> output_map_;
};
bool helper_verbose = false;
/// Helper function for allocating memory
TRITONSERVER_Error*
ResponseAlloc(
TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,
size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,
int64_t preferred_memory_type_id, void* userp, void** buffer,
void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,
int64_t* actual_memory_type_id)
{
// Initially attempt to make the actual memory type and id that we
// allocate be the same as preferred memory type
*actual_memory_type = preferred_memory_type;
*actual_memory_type_id = preferred_memory_type_id;
// This variable indicates whether the buffer should be freed or not.
bool* should_free = new bool;
*buffer_userp = should_free;
*should_free = false;
// If 'byte_size' is zero just return 'buffer' == nullptr, we don't
// need to do any other book-keeping.
if (byte_size == 0) {
*buffer = nullptr;
*buffer_userp = nullptr;
if (helper_verbose) {
std::cout << "allocated " << byte_size << " bytes for result tensor "
<< tensor_name << std::endl;
}
} else {
AllocPayload* alloc_payload = reinterpret_cast<AllocPayload*>(userp);
auto output_map_it = alloc_payload->output_map_.find(tensor_name);
if (output_map_it == alloc_payload->output_map_.end()) {
void* allocated_ptr = nullptr;
*actual_memory_type = TRITONSERVER_MEMORY_CPU;
*actual_memory_type_id = 0;
allocated_ptr = malloc(byte_size);
*should_free = true;
if (allocated_ptr != nullptr) {
*buffer = allocated_ptr;
}
} else {
// It is in shared memory
AllocPayload::OutputInfo* output_info = output_map_it->second;
if (byte_size > output_info->byte_size_) {
return TritonLoader::GetSingleton()->ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string(
"shared memory size specified with the request for output '" +
std::string(tensor_name) + "' (" +
std::to_string(output_info->byte_size_) +
" bytes) should be at least " + std::to_string(byte_size) +
" bytes to hold the results")
.c_str());
}
*actual_memory_type = output_info->memory_type_;
*actual_memory_type_id = output_info->device_id_;
*buffer = output_info->base_;
}
}
return nullptr; // Success
}
/// Helper function for releasing memory
TRITONSERVER_Error*
ResponseRelease(
TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,
size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)
{
bool* should_free = reinterpret_cast<bool*>(buffer_userp);
switch (memory_type) {
case TRITONSERVER_MEMORY_CPU:
if (*should_free) {
free(buffer);
}
break;
}
free(should_free);
return nullptr; // Success
}
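// Illustrative sketch (not part of the original sources): when linking
// directly against libtritonserver rather than resolving symbols through
// dlopen as TritonLoader does below, the two callbacks above would be paired
// with a response allocator roughly like this. The helper is hypothetical.
inline TRITONSERVER_Error*
CreateExampleResponseAllocator(TRITONSERVER_ResponseAllocator** allocator)
{
  // Register ResponseAlloc/ResponseRelease; no per-request start callback.
  return TRITONSERVER_ResponseAllocatorNew(
      allocator, ResponseAlloc, ResponseRelease, nullptr /* start_fn */);
}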
void
InferRequestComplete(
TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)
{
TritonLoader::GetSingleton()->DeleteInferRequest(request);
}
void
InferResponseComplete(
TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)
{
if (response != nullptr) {
// Send 'response' to the future.
std::promise<TRITONSERVER_InferenceResponse*>* p =
reinterpret_cast<std::promise<TRITONSERVER_InferenceResponse*>*>(userp);
p->set_value(response);
delete p;
}
}
Error
GetModelVersionFromString(const std::string& version_string, int64_t* version)
{
if (version_string.empty()) {
*version = 1;
return Error::Success;
}
try {
*version = std::stol(version_string);
}
catch (std::exception& e) {
return Error(
std::string(
"Failed to get model version from specified version string '" +
version_string + "' (details: " + e.what() +
"), version should be an integral value > 0")
.c_str());
}
if (*version < 0) {
return Error(std::string(
"invalid model version specified '" + version_string +
"' , version should be an integral value > 0")
.c_str());
}
return Error::Success;
}
Error
FolderExists(const std::string& path)
{
struct stat buffer;
if (!stat(path.c_str(), &buffer)) {
return Error::Success;
} else {
return Error("Unable to find filepath: " + path);
}
}
} // namespace
Error
TritonLoader::Create(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose)
{
if (!GetSingleton()->ServerIsReady()) {
GetSingleton()->ClearHandles();
RETURN_IF_ERROR(GetSingleton()->PopulateInternals(
triton_server_path, model_repository_path, verbose));
RETURN_IF_ERROR(GetSingleton()->LoadServerLibrary());
RETURN_IF_ERROR(GetSingleton()->StartTriton());
}
return Error::Success;
}
Error
TritonLoader::Delete()
{
if (server_ != nullptr) {
server_is_ready_ = false;
model_is_loaded_ = false;
server_.reset();
}
return Error::Success;
}
Error
TritonLoader::PopulateInternals(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose)
{
RETURN_IF_ERROR(FolderExists(triton_server_path));
RETURN_IF_ERROR(FolderExists(model_repository_path));
triton_server_path_ = triton_server_path;
model_repository_path_ = model_repository_path;
verbose_ = verbose;
verbose_level_ = verbose_ ? 1 : 0;
return Error::Success;
}
Error
TritonLoader::StartTriton()
{
// Check API version.
uint32_t api_version_major, api_version_minor;
REPORT_TRITONSERVER_ERROR(
api_version_fn_(&api_version_major, &api_version_minor));
if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major) ||
(TRITONSERVER_API_VERSION_MINOR > api_version_minor)) {
std::stringstream sstream;
sstream << "triton server API version mismatch. \n"
<< "Expected version major:" << TRITONSERVER_API_VERSION_MAJOR
<< ", minor:" << TRITONSERVER_API_VERSION_MINOR << "\n"
<< " Actual version major:" << api_version_major
<< ", minor:" << api_version_minor;
return Error(sstream.str());
}
// Create the server...
TRITONSERVER_ServerOptions* server_options = nullptr;
RETURN_IF_TRITONSERVER_ERROR(
options_new_fn_(&server_options), "creating server options");
RETURN_IF_TRITONSERVER_ERROR(
options_set_model_repo_path_fn_(
server_options, model_repository_path_.c_str()),
"setting model repository path");
RETURN_IF_TRITONSERVER_ERROR(
set_cuda_memory_pool_byte_size_(server_options, 0, 1073741824),
"setting cuda memory pool byte size failed.");
RETURN_IF_TRITONSERVER_ERROR(
set_log_verbose_fn_(server_options, verbose_level_),
"setting verbose logging level");
RETURN_IF_TRITONSERVER_ERROR(
set_log_info_fn_(server_options, verbose_),
"setting if log verbose level is true");
RETURN_IF_TRITONSERVER_ERROR(
set_backend_directory_fn_(
server_options, (triton_server_path_ + "/backends").c_str()),
"setting backend directory");
RETURN_IF_TRITONSERVER_ERROR(
set_repo_agent_directory_fn_(
server_options, (triton_server_path_ + "/repoagents").c_str()),
"setting repository agent directory");
RETURN_IF_TRITONSERVER_ERROR(
set_strict_model_config_fn_(server_options, true),
"setting strict model configuration");
double min_compute_capability = 0;
// FIXME: Do not have GPU support right now
RETURN_IF_TRITONSERVER_ERROR(
set_min_supported_compute_capability_fn_(
server_options, min_compute_capability),
"setting minimum supported CUDA compute capability");
TRITONSERVER_Server* server_ptr = nullptr;
RETURN_IF_TRITONSERVER_ERROR(
server_new_fn_(&server_ptr, server_options), "creating server");
RETURN_IF_TRITONSERVER_ERROR(
server_options_delete_fn_(server_options), "deleting server options");
std::shared_ptr<TRITONSERVER_Server> shared_server(
server_ptr, server_delete_fn_);
server_ = shared_server;
// Wait until the server is both live and ready.
size_t health_iters = 0;
while (true) {
bool live, ready;
RETURN_IF_TRITONSERVER_ERROR(
server_is_live_fn_(server_.get(), &live),
"unable to get server liveness");
RETURN_IF_TRITONSERVER_ERROR(
server_is_ready_fn_(server_.get(), &ready),
"unable to get server readiness");
if (live && ready) {
server_is_ready_ = true;
break;
}
if (++health_iters >= 10) {
return Error("failed to find healthy inference server");
}
std::this_thread::sleep_for(std::chrono::milliseconds(500));
}
// Print status of the server.
if (verbose_) {
TRITONSERVER_Message* server_metadata_message;
RETURN_IF_TRITONSERVER_ERROR(
server_metadata_fn_(server_.get(), &server_metadata_message),
"unable to get server metadata message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
server_metadata_message, &buffer, &byte_size),
"unable to serialize server metadata message");
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(server_metadata_message),
"deleting status metadata");
}
return Error::Success;
}
Error
TritonLoader::ServerMetaData(rapidjson::Document* server_metadata)
{
if (!ServerIsReady()) {
return Error("Model is not loaded and/or server is not ready");
}
TRITONSERVER_Message* server_metadata_message;
RETURN_IF_TRITONSERVER_ERROR(
server_metadata_fn_(server_.get(), &server_metadata_message),
"unable to get server metadata message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
server_metadata_message, &buffer, &byte_size),
"unable to serialize server metadata message");
server_metadata->Parse(buffer, byte_size);
if (server_metadata->HasParseError()) {
return Error(
"error: failed to parse server metadata from JSON: " +
std::string(GetParseError_En(server_metadata->GetParseError())) +
" at " + std::to_string(server_metadata->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(server_metadata_message), "deleting status metadata");
return Error::Success;
}
Error
TritonLoader::LoadModel(
const std::string& model_name, const std::string& model_version)
{
if (!ServerIsReady()) {
return Error("server is not ready, abort!");
}
model_name_ = model_name;
RETURN_IF_ERROR(GetModelVersionFromString(model_version, &model_version_));
// Wait for the model to become available.
bool is_ready = false;
size_t health_iters = 0;
// A model repository path is required before a model can be loaded.
if (model_repository_path_.empty()) {
return Error("Need to specify model repository");
}
while (!is_ready) {
RETURN_IF_TRITONSERVER_ERROR(
model_is_ready_fn_(
server_.get(), model_name_.c_str(), model_version_, &is_ready),
"unable to get model readiness");
if (!is_ready) {
if (++health_iters >= 10) {
return Error("model failed to be ready in 10 iterations");
}
std::this_thread::sleep_for(std::chrono::milliseconds(500));
continue;
}
}
// flag to confirm model is correct and loaded
model_is_loaded_ = true;
return Error::Success;
}
Error
TritonLoader::ModelMetadata(rapidjson::Document* model_metadata)
{
if (!ModelIsLoaded() || !ServerIsReady()) {
return Error("Model is not loaded and/or server is not ready");
}
TRITONSERVER_Message* model_metadata_message;
// get model metadata
RETURN_IF_TRITONSERVER_ERROR(
model_metadata_fn_(
server_.get(), model_name_.c_str(), model_version_,
&model_metadata_message),
"unable to get model metadata message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
model_metadata_message, &buffer, &byte_size),
"unable to serialize model status protobuf");
model_metadata->Parse(buffer, byte_size);
if (model_metadata->HasParseError()) {
return Error(
"error: failed to parse model metadata from JSON: " +
std::string(GetParseError_En(model_metadata->GetParseError())) +
" at " + std::to_string(model_metadata->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(model_metadata_message), "deleting status protobuf");
if (strcmp((*model_metadata)["name"].GetString(), model_name_.c_str())) {
return Error("unable to find metadata for model");
}
bool found_version = false;
if (model_metadata->HasMember("versions")) {
for (const auto& version : (*model_metadata)["versions"].GetArray()) {
if (strcmp(version.GetString(), std::to_string(model_version_).c_str()) ==
0) {
found_version = true;
break;
}
}
}
if (!found_version) {
std::string msg = "unable to find version " +
std::to_string(model_version_) + " status for model";
return Error(msg);
}
return Error::Success;
}
Error
TritonLoader::ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version)
{
if (!ModelIsLoaded() || !ServerIsReady()) {
return Error("Model is not loaded and/or server is not ready");
}
TRITONSERVER_Message* model_config_message;
uint32_t config_version = 1;
RETURN_IF_TRITONSERVER_ERROR(
model_config_fn_(
(server_).get(), model_name.c_str(), model_version_, config_version,
&model_config_message),
"unable to get model config message");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(model_config_message, &buffer, &byte_size),
"unable to serialize model config status protobuf");
model_config->Parse(buffer, byte_size);
if (model_config->HasParseError()) {
return Error(
"error: failed to parse model config from JSON: " +
std::string(GetParseError_En(model_config->GetParseError())) + " at " +
std::to_string(model_config->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(model_config_message),
"deleting server config status protobuf");
return Error::Success;
}
Error
TritonLoader::LoadServerLibrary()
{
std::string full_path = triton_server_path_ + server_library_path_;
RETURN_IF_ERROR(FolderExists(full_path));
RETURN_IF_ERROR(OpenLibraryHandle(full_path, &dlhandle_));
TritonServerApiVersionFn_t apifn;
TritonServerOptionsNewFn_t onfn;
TritonServerOptionSetModelRepoPathFn_t rpfn;
TritonServerSetLogVerboseFn_t slvfn;
TritonServerSetBackendDirFn_t sbdfn;
TritonServerSetRepoAgentDirFn_t srdfn;
TritonServerSetStrictModelConfigFn_t ssmcfn;
TritonServerSetMinSupportedComputeCapabilityFn_t smsccfn;
TritonServerNewFn_t snfn;
TritonServerOptionsDeleteFn_t odfn;
TritonServerDeleteFn_t sdfn;
TritonServerIsLiveFn_t ilfn;
TritonServerIsReadyFn_t irfn;
TritonServerMetadataFn_t smfn;
TritonServerMessageSerializeToJsonFn_t stjfn;
TritonServerMessageDeleteFn_t mdfn;
TritonServerModelIsReadyFn_t mirfn;
TritonServerModelMetadataFn_t mmfn;
TritonServerResponseAllocatorNewFn_t ranfn;
TritonServerInferenceRequestNewFn_t irnfn;
TritonServerInferenceRequestSetIdFn_t irsifn;
TritonServerInferenceRequestSetReleaseCallbackFn_t irsrcfn;
TritonServerInferenceRequestAddInputFn_t iraifn;
TritonServerInferenceRequestAddRequestedOutputFn_t irarofn;
TritonServerInferenceRequestAppendInputDataFn_t iraidfn;
TritonServerInferenceRequestSetResponseCallbackFn_t irsrescfn;
TritonServerInferAsyncFn_t iafn;
TritonServerInferenceResponseErrorFn_t irefn;
TritonServerInferenceResponseDeleteFn_t irdfn;
TritonServerResponseAllocatorDeleteFn_t radfn;
TritonServerErrorNewFn_t enfn;
TritonServerMemoryTypeStringFn_t mtsfn;
TritonServerInferenceResponseOutputCountFn_t irocfn;
TritonServerDataTypeStringFn_t dtsfn;
TritonServerErrorDeleteFn_t edfn;
TritonServerErrorCodeToStringFn_t ectsfn;
TritonServerErrorMessageFn_t emfn;
TritonServerModelConfigFn_t mcfn;
TritonServerInferenceRequestSetCorrelationIdFn_t scidfn;
TritonServerInferenceRequestSetStringCorrelationIdFn_t sscidfn;
TritonServerInferenceRequestSetFlagsFn_t sffn;
TritonServerInferenceRequestSetPriorityFn_t spfn;
TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t stmsfn;
TritonServerStringToDatatypeFn_t stdtfn;
TritonServerInferenceResponseOutputFn_t irofn;
TritonServerRequestIdFn_t ridfn;
TritonServerRequestDeleteFn_t rdfn;
TritonServerModelStatisticsFn_t msfn;
TritonSeverUnloadModelFn_t umfn;
TritonSeverSetLogInfoFn_t slifn;
TritonServerSetCudaMemoryPoolByteSizeFn_t scmpbsfn;
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ApiVersion", false /* optional */,
reinterpret_cast<void**>(&apifn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsNew", false /* optional */,
reinterpret_cast<void**>(&onfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetModelRepositoryPath",
false /* optional */, reinterpret_cast<void**>(&rpfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetLogVerbose",
false /* optional */, reinterpret_cast<void**>(&slvfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetBackendDirectory",
false /* optional */, reinterpret_cast<void**>(&sbdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetRepoAgentDirectory",
false /* optional */, reinterpret_cast<void**>(&srdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetStrictModelConfig",
false /* optional */, reinterpret_cast<void**>(&ssmcfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability",
false /* optional */, reinterpret_cast<void**>(&smsccfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize",
false /* optional */, reinterpret_cast<void**>(&scmpbsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerNew", false /* optional */,
reinterpret_cast<void**>(&snfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsDelete", false /* optional */,
reinterpret_cast<void**>(&odfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerDelete", false /* optional */,
reinterpret_cast<void**>(&sdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerIsLive", false /* optional */,
reinterpret_cast<void**>(&ilfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerIsReady", false /* optional */,
reinterpret_cast<void**>(&irfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerMetadata", false /* optional */,
reinterpret_cast<void**>(&smfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_MessageSerializeToJson", false /* optional */,
reinterpret_cast<void**>(&stjfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_MessageDelete", false /* optional */,
reinterpret_cast<void**>(&mdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelIsReady", false /* optional */,
reinterpret_cast<void**>(&mirfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelMetadata", false /* optional */,
reinterpret_cast<void**>(&mmfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ResponseAllocatorNew", false /* optional */,
reinterpret_cast<void**>(&ranfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestNew", false /* optional */,
reinterpret_cast<void**>(&irnfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetId", false /* optional */,
reinterpret_cast<void**>(&irsifn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetReleaseCallback",
false /* optional */, reinterpret_cast<void**>(&irsrcfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestAddInput", false /* optional */,
reinterpret_cast<void**>(&iraifn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestAddRequestedOutput",
false /* optional */, reinterpret_cast<void**>(&irarofn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestAppendInputData",
false /* optional */, reinterpret_cast<void**>(&iraidfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetResponseCallback",
false /* optional */, reinterpret_cast<void**>(&irsrescfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerInferAsync", false /* optional */,
reinterpret_cast<void**>(&iafn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseError", false /* optional */,
reinterpret_cast<void**>(&irefn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseDelete", false /* optional */,
reinterpret_cast<void**>(&irdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ResponseAllocatorDelete", false /* optional */,
reinterpret_cast<void**>(&radfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorNew", false /* optional */,
reinterpret_cast<void**>(&enfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_MemoryTypeString", false /* optional */,
reinterpret_cast<void**>(&mtsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseOutputCount",
false /* optional */, reinterpret_cast<void**>(&irocfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_DataTypeString", false /* optional */,
reinterpret_cast<void**>(&dtsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorDelete", false /* optional */,
reinterpret_cast<void**>(&edfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorCodeString", false /* optional */,
reinterpret_cast<void**>(&ectsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ErrorMessage", false /* optional */,
reinterpret_cast<void**>(&emfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelConfig", false /* optional */,
reinterpret_cast<void**>(&mcfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetCorrelationId",
false /* optional */, reinterpret_cast<void**>(&scidfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetCorrelationIdString",
false /* optional */, reinterpret_cast<void**>(&sscidfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetFlags", false /* optional */,
reinterpret_cast<void**>(&sffn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetPriorityUInt64",
false /* optional */, reinterpret_cast<void**>(&spfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestSetTimeoutMicroseconds",
false /* optional */, reinterpret_cast<void**>(&stmsfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_StringToDataType", false /* optional */,
reinterpret_cast<void**>(&stdtfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceResponseOutput", false /* optional */,
reinterpret_cast<void**>(&irofn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestId", false /* optional */,
reinterpret_cast<void**>(&ridfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_InferenceRequestDelete", false /* optional */,
reinterpret_cast<void**>(&rdfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerModelStatistics", false /* optional */,
reinterpret_cast<void**>(&msfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerUnloadModel", false /* optional */,
reinterpret_cast<void**>(&umfn)));
RETURN_IF_ERROR(GetEntrypoint(
dlhandle_, "TRITONSERVER_ServerOptionsSetLogInfo", false /* optional */,
reinterpret_cast<void**>(&slifn)));
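  // All entrypoints were resolved successfully, so commit them to the member
  // handles. A failed lookup above returns early and leaves the members
  // untouched.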
api_version_fn_ = apifn;
options_new_fn_ = onfn;
options_set_model_repo_path_fn_ = rpfn;
set_log_verbose_fn_ = slvfn;
set_backend_directory_fn_ = sbdfn;
set_repo_agent_directory_fn_ = srdfn;
set_strict_model_config_fn_ = ssmcfn;
set_min_supported_compute_capability_fn_ = smsccfn;
server_new_fn_ = snfn;
server_options_delete_fn_ = odfn;
server_delete_fn_ = sdfn;
server_is_live_fn_ = ilfn;
server_is_ready_fn_ = irfn;
server_metadata_fn_ = smfn;
message_serialize_to_json_fn_ = stjfn;
message_delete_fn_ = mdfn;
model_is_ready_fn_ = mirfn;
model_metadata_fn_ = mmfn;
response_allocator_new_fn_ = ranfn;
inference_request_new_fn_ = irnfn;
inference_request_set_id_fn_ = irsifn;
inference_request_set_release_callback_fn_ = irsrcfn;
inference_request_add_input_fn_ = iraifn;
inference_request_add_requested_output_fn_ = irarofn;
inference_request_append_input_data_fn_ = iraidfn;
inference_request_set_response_callback_fn_ = irsrescfn;
infer_async_fn_ = iafn;
inference_response_error_fn_ = irefn;
inference_response_delete_fn_ = irdfn;
response_allocator_delete_fn_ = radfn;
error_new_fn_ = enfn;
memory_type_string_fn_ = mtsfn;
inference_response_output_count_fn_ = irocfn;
data_type_string_fn_ = dtsfn;
error_delete_fn_ = edfn;
error_code_to_string_fn_ = ectsfn;
error_message_fn_ = emfn;
model_config_fn_ = mcfn;
set_correlation_id_fn_ = scidfn;
set_string_correlation_id_fn_ = sscidfn;
set_flags_fn_ = sffn;
set_priority_fn_ = spfn;
set_timeout_ms_fn_ = stmsfn;
string_to_datatype_fn_ = stdtfn;
inference_response_output_fn_ = irofn;
request_id_fn_ = ridfn;
request_delete_fn_ = rdfn;
model_statistics_fn_ = msfn;
unload_model_fn_ = umfn;
set_log_info_fn_ = slifn;
set_cuda_memory_pool_byte_size_ = scmpbsfn;
return Error::Success;
}
void
TritonLoader::ClearHandles()
{
dlhandle_ = nullptr;
api_version_fn_ = nullptr;
options_new_fn_ = nullptr;
options_set_model_repo_path_fn_ = nullptr;
set_log_verbose_fn_ = nullptr;
set_backend_directory_fn_ = nullptr;
set_repo_agent_directory_fn_ = nullptr;
set_strict_model_config_fn_ = nullptr;
set_min_supported_compute_capability_fn_ = nullptr;
server_new_fn_ = nullptr;
server_options_delete_fn_ = nullptr;
server_delete_fn_ = nullptr;
server_is_live_fn_ = nullptr;
server_is_ready_fn_ = nullptr;
server_metadata_fn_ = nullptr;
message_serialize_to_json_fn_ = nullptr;
message_delete_fn_ = nullptr;
model_is_ready_fn_ = nullptr;
model_metadata_fn_ = nullptr;
response_allocator_new_fn_ = nullptr;
inference_request_new_fn_ = nullptr;
inference_request_set_id_fn_ = nullptr;
inference_request_set_release_callback_fn_ = nullptr;
inference_request_add_input_fn_ = nullptr;
inference_request_add_requested_output_fn_ = nullptr;
inference_request_append_input_data_fn_ = nullptr;
inference_request_set_response_callback_fn_ = nullptr;
infer_async_fn_ = nullptr;
inference_response_error_fn_ = nullptr;
inference_response_delete_fn_ = nullptr;
response_allocator_delete_fn_ = nullptr;
error_new_fn_ = nullptr;
memory_type_string_fn_ = nullptr;
inference_response_output_count_fn_ = nullptr;
data_type_string_fn_ = nullptr;
error_message_fn_ = nullptr;
error_delete_fn_ = nullptr;
error_code_to_string_fn_ = nullptr;
model_config_fn_ = nullptr;
set_correlation_id_fn_ = nullptr;
set_string_correlation_id_fn_ = nullptr;
set_flags_fn_ = nullptr;
set_priority_fn_ = nullptr;
set_timeout_ms_fn_ = nullptr;
string_to_datatype_fn_ = nullptr;
inference_response_output_fn_ = nullptr;
request_id_fn_ = nullptr;
request_delete_fn_ = nullptr;
model_statistics_fn_ = nullptr;
unload_model_fn_ = nullptr;
  set_log_info_fn_ = nullptr;
  set_cuda_memory_pool_byte_size_ = nullptr;
}
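// Note: existence is tested by attempting to open the file for reading, so a
// file that exists but is unreadable is also reported as missing.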
Error
TritonLoader::FileExists(std::string& filepath)
{
std::ifstream ifile;
ifile.open(filepath);
if (!ifile) {
return Error("unable to find local Triton library: " + filepath);
} else {
return Error::Success;
}
}
Error
TritonLoader::Infer(
const tc::InferOptions& options, const std::vector<tc::InferInput*>& inputs,
const std::vector<const tc::InferRequestedOutput*>& outputs,
InferResult** result)
{
Error error = Error::Success;
if (!ServerIsReady() || !ModelIsLoaded()) {
return Error("Server is not ready and/or requested model is not loaded");
}
TRITONSERVER_ResponseAllocator* allocator = nullptr;
TRITONSERVER_InferenceRequest* irequest = nullptr;
TRITONSERVER_InferenceResponse* completed_response = nullptr;
tc::RequestTimers timer;
timer.Reset();
timer.CaptureTimestamp(tc::RequestTimers::Kind::REQUEST_START);
RETURN_IF_ERROR(InitializeRequest(options, outputs, &allocator, &irequest));
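  // From this point on any early return must release the response and the
  // allocator; the ScopedDefer runs CleanUp either when Complete() is called
  // on the success path below or when this scope unwinds.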
ScopedDefer error_handler([&error, &completed_response, &allocator, this] {
error = CleanUp(completed_response, allocator);
});
RETURN_IF_ERROR(AddInputs(inputs, irequest));
RETURN_IF_ERROR(AddOutputs(outputs, irequest));
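  // For outputs the caller registered in shared memory, record the target
  // buffer in alloc_payload; it is handed to the response allocator (via
  // response_allocator_userp below) so results can be placed directly into
  // that region.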
AllocPayload alloc_payload;
for (auto& output : outputs) {
if (output->IsSharedMemory()) {
std::string shm_name;
size_t shm_byte_size;
size_t offset;
// TODO: Error handling
output->SharedMemoryInfo(&shm_name, &shm_byte_size, &offset);
void* buf;
TRITONSERVER_MemoryType memory_type;
int64_t memory_type_id;
RETURN_IF_ERROR(shm_manager_->GetMemoryInfo(
shm_name, offset, &buf, &memory_type, &memory_type_id));
alloc_payload.output_map_.emplace(
std::piecewise_construct, std::forward_as_tuple(output->Name()),
std::forward_as_tuple(new AllocPayload::OutputInfo(
buf, shm_byte_size, memory_type, memory_type_id)));
}
}
const char* cid = nullptr;
RETURN_IF_TRITONSERVER_ERROR(
request_id_fn_(irequest, &cid), "Failed to get request id");
std::string id = cid;
// Perform inference...
timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_START);
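  // The raw promise is handed to the response callback through the userp
  // pointer; the callback is expected to fulfill it with the completed
  // response (and to delete the promise), letting this thread block on the
  // future below.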
auto p = new std::promise<TRITONSERVER_InferenceResponse*>();
std::future<TRITONSERVER_InferenceResponse*> completed = p->get_future();
RETURN_IF_TRITONSERVER_ERROR(
inference_request_set_response_callback_fn_(
irequest, allocator, &alloc_payload /* response_allocator_userp */,
InferResponseComplete, reinterpret_cast<void*>(p)),
"setting response callback");
RETURN_IF_TRITONSERVER_ERROR(
infer_async_fn_((server_).get(), irequest, nullptr /* trace */),
"running inference");
timer.CaptureTimestamp(tc::RequestTimers::Kind::SEND_END);
// Wait for the inference to complete.
completed_response = completed.get();
RETURN_IF_TRITONSERVER_ERROR(
inference_response_error_fn_(completed_response),
"inference response error");
timer.CaptureTimestamp(tc::RequestTimers::Kind::RECV_START);
timer.CaptureTimestamp(tc::RequestTimers::Kind::RECV_END);
timer.CaptureTimestamp(tc::RequestTimers::Kind::REQUEST_END);
tc::Error err = UpdateInferStat(timer);
if (!err.IsOk()) {
std::cerr << "Failed to update context stat: " << err << std::endl;
}
InferResult::Create(result, err, id);
// CleanUp the response allocators
error_handler.Complete();
return error;
}
Error
TritonLoader::CleanUp(
TRITONSERVER_InferenceResponse* completed_response,
TRITONSERVER_ResponseAllocator* allocator)
{
TRITONSERVER_Error* response_err = nullptr;
if (completed_response != nullptr) {
response_err = inference_response_delete_fn_(completed_response);
}
TRITONSERVER_Error* allocator_err = response_allocator_delete_fn_(allocator);
RETURN_IF_TRITONSERVER_ERROR(response_err, "deleting inference response");
RETURN_IF_TRITONSERVER_ERROR(allocator_err, "deleting response allocator");
return Error::Success;
}
Error
TritonLoader::InitializeRequest(
const tc::InferOptions& options,
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_InferenceRequest** irequest)
{
// Create the allocator that will be used to allocate buffers for
// the result tensors.
RETURN_IF_TRITONSERVER_ERROR(
GetSingleton()->response_allocator_new_fn_(
allocator,
reinterpret_cast<
TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator* allocator,
const char* tensor_name, size_t byte_size,
TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id, void* userp,
void** buffer, void** buffer_userp,
TRITONSERVER_MemoryType*
actual_memory_type,
int64_t* actual_memory_type_id)>(
ResponseAlloc),
reinterpret_cast<
TRITONSERVER_Error* (*)(TRITONSERVER_ResponseAllocator* allocator,
void* buffer, void* buffer_userp,
size_t byte_size,
TRITONSERVER_MemoryType memory_type,
int64_t memory_type_id)>(ResponseRelease),
nullptr /* start_fn */),
"creating response allocator");
// set up inference request
RETURN_IF_TRITONSERVER_ERROR(
inference_request_new_fn_(
irequest, (server_).get(), model_name_.c_str(), model_version_),
"creating inference request");
RETURN_IF_TRITONSERVER_ERROR(
inference_request_set_id_fn_(*irequest, options.request_id_.c_str()),
"setting ID for the request");
if ((options.sequence_id_ != 0) || (options.sequence_id_str_ != "") ||
(options.priority_ != 0) || (options.server_timeout_ != 0) ||
outputs.empty()) {
if (options.sequence_id_ != 0) {
RETURN_IF_TRITONSERVER_ERROR(
set_correlation_id_fn_(*irequest, options.sequence_id_),
"setting sequence ID for the request");
} else if (options.sequence_id_str_ != "") {
RETURN_IF_TRITONSERVER_ERROR(
set_string_correlation_id_fn_(
*irequest, options.sequence_id_str_.c_str()),
"setting sequence ID for the request");
}
uint32_t flags = 0;
if (options.sequence_start_) {
flags |= TRITONSERVER_REQUEST_FLAG_SEQUENCE_START;
}
if (options.sequence_end_) {
flags |= TRITONSERVER_REQUEST_FLAG_SEQUENCE_END;
}
RETURN_IF_TRITONSERVER_ERROR(
set_flags_fn_(*irequest, flags),
"setting inference flags for the request");
}
if (options.priority_ != 0) {
RETURN_IF_TRITONSERVER_ERROR(
set_priority_fn_(*irequest, options.priority_),
"setting priority for the request");
}
if (options.server_timeout_ != 0) {
RETURN_IF_TRITONSERVER_ERROR(
set_timeout_ms_fn_(*irequest, options.server_timeout_),
"setting timeout for the request");
}
RETURN_IF_TRITONSERVER_ERROR(
inference_request_set_release_callback_fn_(
*irequest, InferRequestComplete, nullptr /* request_release_userp */),
"setting request release callback");
return Error::Success;
}
Error
TritonLoader::AddInputs(
const std::vector<tc::InferInput*>& inputs,
TRITONSERVER_InferenceRequest* irequest)
{
for (auto io : inputs) {
const char* input_name = io->Name().c_str();
const char* datatype = io->Datatype().c_str();
const TRITONSERVER_DataType dtype = string_to_datatype_fn_(datatype);
std::vector<int64_t> shape_vec;
for (const int64_t dim : io->Shape()) { // this is a vector, just use it
shape_vec.push_back(dim);
}
RETURN_IF_TRITONSERVER_ERROR(
inference_request_add_input_fn_(
irequest, input_name, dtype, &shape_vec[0], shape_vec.size()),
"setting input for the request");
size_t byte_size;
tc::Error err = io->ByteSize(&byte_size);
if (!err.IsOk()) {
return Error(err.Message());
}
if (byte_size == 0) {
RETURN_IF_TRITONSERVER_ERROR(
inference_request_append_input_data_fn_(
irequest, input_name, nullptr, 0 /* byte_size */,
TRITONSERVER_MEMORY_CPU /* memory type */,
0 /* memory_type_id */),
"appending input data with byte size zero");
} else {
if (!io->IsSharedMemory()) {
io->PrepareForRequest();
bool end_of_input = false;
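        // Inputs not backed by shared memory may be supplied in several
        // chunks; append each buffer returned by GetNext() until the input
        // is exhausted.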
while (!end_of_input) {
const uint8_t* buf;
size_t buf_size;
io->GetNext(&buf, &buf_size, &end_of_input);
if (buf != nullptr) {
RETURN_IF_TRITONSERVER_ERROR(
inference_request_append_input_data_fn_(
irequest, input_name, const_cast<uint8_t*>(buf), buf_size,
TRITONSERVER_MEMORY_CPU /* memory_type */,
0 /* memory_type_id */),
"appending data to tritonserver");
}
}
} else {
std::string shm_name;
size_t shm_byte_size;
size_t offset;
// TODO: Error handling
io->SharedMemoryInfo(&shm_name, &shm_byte_size, &offset);
void* buf;
TRITONSERVER_MemoryType memory_type;
int64_t memory_type_id;
RETURN_IF_ERROR(shm_manager_->GetMemoryInfo(
shm_name, offset, &buf, &memory_type, &memory_type_id));
RETURN_IF_TRITONSERVER_ERROR(
inference_request_append_input_data_fn_(
irequest, input_name, buf, byte_size,
memory_type /* memory_type */,
memory_type_id /* memory_type_id */),
"appending data to tritonserver");
}
}
}
return Error::Success;
}
Error
TritonLoader::AddOutputs(
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_InferenceRequest* irequest)
{
for (auto io : outputs) {
const char* output_name = io->Name().c_str();
RETURN_IF_TRITONSERVER_ERROR(
inference_request_add_requested_output_fn_(irequest, output_name),
"setting output for the request");
}
return Error::Success;
}
Error
TritonLoader::ModelInferenceStatistics(
const std::string& model_name, const std::string& model_version,
rapidjson::Document* infer_stat)
{
if (ServerIsReady() && ModelIsLoaded()) {
TRITONSERVER_Message* model_stats_message = nullptr;
int64_t requested_model_version;
auto err =
GetModelVersionFromString(model_version, &requested_model_version);
if (err.IsOk()) {
RETURN_IF_TRITONSERVER_ERROR(
model_statistics_fn_(
(server_).get(), model_name.c_str(), requested_model_version,
&model_stats_message),
"getting model statistics from server");
const char* buffer;
size_t byte_size;
RETURN_IF_TRITONSERVER_ERROR(
message_serialize_to_json_fn_(
model_stats_message, &buffer, &byte_size),
"serializing message to json");
infer_stat->Parse(buffer, byte_size);
if (infer_stat->HasParseError()) {
return Error(
"error: failed to parse server metadata from JSON: " +
std::string(GetParseError_En(infer_stat->GetParseError())) +
" at " + std::to_string(infer_stat->GetErrorOffset()));
}
RETURN_IF_TRITONSERVER_ERROR(
message_delete_fn_(model_stats_message),
"deleting inference statistics message");
}
return err;
} else {
return Error(
"Trying to get model statistics while server is not started or model "
"is not ready");
}
}
TritonLoader*
TritonLoader::GetSingleton()
{
static TritonLoader loader;
return &loader;
}
TritonLoader::~TritonLoader()
{
FAIL_IF_ERR(Delete(), "dereferencing server instance...");
FAIL_IF_ERR(CloseLibraryHandle(dlhandle_), "error on closing triton loader");
ClearHandles();
}
#ifdef TRITON_ENABLE_GPU
Error
TritonLoader::RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size)
{
RETURN_IF_ERROR(shm_manager_->RegisterCUDAMemory(
name, handle, byte_size, 0 /* device id */));
return Error::Success;
}
#endif // TRITON_ENABLE_GPU
Error
TritonLoader::RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size)
{
RETURN_IF_ERROR(shm_manager_->RegisterSystemMemory(name, ptr, byte_size));
return Error::Success;
}
Error
TritonLoader::UnregisterAllSharedMemory()
{
  RETURN_IF_ERROR(shm_manager_->UnregisterAll(TRITONSERVER_MEMORY_CPU));
  RETURN_IF_ERROR(shm_manager_->UnregisterAll(TRITONSERVER_MEMORY_GPU));
return Error::Success;
}
TRITONSERVER_Error*
TritonLoader::ErrorNew(TRITONSERVER_Error_Code code, const char* message)
{
return error_new_fn_(code, message);
}
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include "../client_backend.h"
#include "common.h"
#include "shared_library.h"
#include "shared_memory_manager.h"
#include "triton/core/tritonserver.h"
// If TRITONSERVER error is non-OK, return the corresponding status.
#define RETURN_IF_TRITONSERVER_ERROR(E, MSG) \
do { \
TRITONSERVER_Error* err__ = (E); \
if (err__ != nullptr) { \
std::cout << "error: " << (MSG) << ": " \
<< GetSingleton()->error_code_to_string_fn_(err__) << " - " \
<< GetSingleton()->error_message_fn_(err__) << std::endl; \
Error newErr = Error(MSG); \
GetSingleton()->error_delete_fn_(err__); \
return newErr; \
} \
} while (false)
#define FAIL_IF_TRITONSERVER_ERROR(E, MSG) \
do { \
TRITONSERVER_Error* err__ = (E); \
if (err__ != nullptr) { \
std::cerr << "error: " << (MSG) << ": " \
<< GetSingleton()->error_code_to_string_fn_(err__) << " - " \
<< GetSingleton()->error_message_fn_(err__) << std::endl; \
Error newErr = Error(MSG); \
GetSingleton()->error_delete_fn_(err__); \
exit(1); \
} \
} while (false)
#define REPORT_TRITONSERVER_ERROR(E) \
do { \
TRITONSERVER_Error* err__ = (E); \
if (err__ != nullptr) { \
std::cout << GetSingleton()->error_message_fn_(err__) << std::endl; \
GetSingleton()->error_delete_fn_(err__); \
} \
} while (false)
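// A minimal usage sketch (assuming the macro is expanded inside a
// TritonLoader member, where the resolved entrypoints such as
// server_is_live_fn_ are accessible):
//   bool live = false;
//   RETURN_IF_TRITONSERVER_ERROR(
//       server_is_live_fn_(server_.get(), &live), "checking server liveness");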
namespace tc = triton::client;
namespace triton { namespace perfanalyzer { namespace clientbackend {
namespace tritoncapi {
class InferResult;
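// TritonLoader dynamically loads libtritonserver.so and exposes an in-process
// Triton server to the perf_analyzer C API backend. A rough usage sketch (the
// exact call sequence is driven by the calling backend):
//   TritonLoader::Create(triton_server_path, model_repository_path, verbose);
//   TritonLoader::GetSingleton()->LoadModel(model_name, model_version);
//   TritonLoader::GetSingleton()->Infer(options, inputs, outputs, &result);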
class TritonLoader : public tc::InferenceServerClient {
public:
~TritonLoader();
static Error Create(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose);
Error Delete();
Error StartTriton();
Error LoadModel(
const std::string& model_name, const std::string& model_version);
Error ModelMetadata(rapidjson::Document* model_metadata);
Error ModelConfig(
rapidjson::Document* model_config, const std::string& model_name,
const std::string& model_version);
Error ServerMetaData(rapidjson::Document* server_metadata);
Error Infer(
const tc::InferOptions& options,
const std::vector<tc::InferInput*>& inputs,
const std::vector<const tc::InferRequestedOutput*>& outputs,
InferResult** result);
Error CleanUp(
TRITONSERVER_InferenceResponse* completed_response,
TRITONSERVER_ResponseAllocator* allocator);
Error ModelInferenceStatistics(
const std::string& model_name, const std::string& model_version,
rapidjson::Document* infer_stat);
Error ClientInferStat(tc::InferStat* infer_stat)
{
*infer_stat = infer_stat_;
return Error::Success;
}
#ifdef TRITON_ENABLE_GPU
Error RegisterCudaMemory(
const std::string& name, void* handle, const size_t byte_size);
#endif // TRITON_ENABLE_GPU
Error RegisterSystemMemory(
const std::string& name, void* ptr, const size_t byte_size);
Error UnregisterAllSharedMemory();
TRITONSERVER_Error* ErrorNew(
TRITONSERVER_Error_Code code, const char* message);
bool ModelIsLoaded() { return model_is_loaded_; }
bool ServerIsReady() { return server_is_ready_; }
TRITONSERVER_Error* DeleteInferRequest(
TRITONSERVER_InferenceRequest* irequest)
{
return request_delete_fn_(irequest);
}
static TritonLoader* GetSingleton();
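  // Each typedef below mirrors the signature of the TRITONSERVER_* C-API
  // function named in the comment that precedes it; LoadServerLibrary()
  // resolves the corresponding symbol from the shared library into a member
  // of that type.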
// TRITONSERVER_ApiVersion
typedef TRITONSERVER_Error* (*TritonServerApiVersionFn_t)(
uint32_t* major, uint32_t* minor);
// TRITONSERVER_ServerOptionsNew
typedef TRITONSERVER_Error* (*TritonServerOptionsNewFn_t)(
TRITONSERVER_ServerOptions** options);
// TRITONSERVER_ServerOptionsSetModelRepositoryPath
typedef TRITONSERVER_Error* (*TritonServerOptionSetModelRepoPathFn_t)(
TRITONSERVER_ServerOptions* options, const char* model_repository_path);
// TRITONSERVER_ServerOptionsSetLogVerbose
typedef TRITONSERVER_Error* (*TritonServerSetLogVerboseFn_t)(
TRITONSERVER_ServerOptions* options, int level);
// TRITONSERVER_ServerOptionsSetBackendDirectory
typedef TRITONSERVER_Error* (*TritonServerSetBackendDirFn_t)(
TRITONSERVER_ServerOptions* options, const char* backend_dir);
// TRITONSERVER_ServerOptionsSetRepoAgentDirectory
typedef TRITONSERVER_Error* (*TritonServerSetRepoAgentDirFn_t)(
TRITONSERVER_ServerOptions* options, const char* repoagent_dir);
// TRITONSERVER_ServerOptionsSetStrictModelConfig
typedef TRITONSERVER_Error* (*TritonServerSetStrictModelConfigFn_t)(
TRITONSERVER_ServerOptions* options, bool strict);
// TRITONSERVER_ServerOptionsSetMinSupportedComputeCapability
typedef TRITONSERVER_Error* (
*TritonServerSetMinSupportedComputeCapabilityFn_t)(
TRITONSERVER_ServerOptions* options, double cc);
// TRITONSERVER_ServerNew
typedef TRITONSERVER_Error* (*TritonServerNewFn_t)(
TRITONSERVER_Server** server, TRITONSERVER_ServerOptions* option);
// TRITONSERVER_ServerOptionsDelete
typedef TRITONSERVER_Error* (*TritonServerOptionsDeleteFn_t)(
TRITONSERVER_ServerOptions* options);
// TRITONSERVER_ServerDelete
typedef TRITONSERVER_Error* (*TritonServerDeleteFn_t)(
TRITONSERVER_Server* server);
// TRITONSERVER_ServerIsLive
typedef TRITONSERVER_Error* (*TritonServerIsLiveFn_t)(
TRITONSERVER_Server* server, bool* live);
// TRITONSERVER_ServerIsReady
typedef TRITONSERVER_Error* (*TritonServerIsReadyFn_t)(
TRITONSERVER_Server* server, bool* ready);
// TRITONSERVER_ServerMetadata
typedef TRITONSERVER_Error* (*TritonServerMetadataFn_t)(
TRITONSERVER_Server* server, TRITONSERVER_Message** server_metadata);
// TRITONSERVER_MessageSerializeToJson
typedef TRITONSERVER_Error* (*TritonServerMessageSerializeToJsonFn_t)(
TRITONSERVER_Message* message, const char** base, size_t* byte_size);
// TRITONSERVER_MessageDelete
typedef TRITONSERVER_Error* (*TritonServerMessageDeleteFn_t)(
TRITONSERVER_Message* message);
// TRITONSERVER_ServerModelIsReady
typedef TRITONSERVER_Error* (*TritonServerModelIsReadyFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, bool* ready);
// TRITONSERVER_ServerModelMetadata
typedef TRITONSERVER_Error* (*TritonServerModelMetadataFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_metadata);
// TRITONSERVER_ResponseAllocatorNew
typedef TRITONSERVER_Error* (*TritonServerResponseAllocatorNewFn_t)(
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_ResponseAllocatorAllocFn_t alloc_fn,
TRITONSERVER_ResponseAllocatorReleaseFn_t release_fn,
TRITONSERVER_ResponseAllocatorStartFn_t start_fn);
// TRITONSERVER_InferenceRequestNew
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestNewFn_t)(
TRITONSERVER_InferenceRequest** inference_request,
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version);
// TRITONSERVER_InferenceRequestSetId
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* id);
// TRITONSERVER_InferenceRequestSetReleaseCallback
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetReleaseCallbackFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceRequestReleaseFn_t request_release_fn,
void* request_release_userp);
// TRITONSERVER_InferenceRequestAddInput
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestAddInputFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const TRITONSERVER_DataType datatype, const int64_t* shape,
uint64_t dim_count);
// TRITONSERVER_InferenceRequestAddRequestedOutput
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestAddRequestedOutputFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
// TRITONSERVER_InferenceRequestAppendInputData
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestAppendInputDataFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name,
const void* base, size_t byte_size, TRITONSERVER_MemoryType memory_type,
int64_t memory_type_i);
// TRITONSERVER_InferenceRequestSetResponseCallback
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetResponseCallbackFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_ResponseAllocator* response_allocator,
void* response_allocator_userp,
TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
void* response_userp);
// TRITONSERVER_ServerInferAsync
typedef TRITONSERVER_Error* (*TritonServerInferAsyncFn_t)(
TRITONSERVER_Server* server,
TRITONSERVER_InferenceRequest* inference_request,
TRITONSERVER_InferenceTrace* trace);
// TRITONSERVER_InferenceResponseError
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseErrorFn_t)(
TRITONSERVER_InferenceResponse* inference_response);
// TRITONSERVER_InferenceResponseDelete
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseDeleteFn_t)(
TRITONSERVER_InferenceResponse* inference_response);
// TRITONSERVER_InferenceRequestRemoveAllInputData
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestRemoveAllInputDataFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char* name);
// TRITONSERVER_ResponseAllocatorDelete
typedef TRITONSERVER_Error* (*TritonServerResponseAllocatorDeleteFn_t)(
TRITONSERVER_ResponseAllocator* allocator);
// TRITONSERVER_ErrorNew
typedef TRITONSERVER_Error* (*TritonServerErrorNewFn_t)(
TRITONSERVER_Error_Code code, const char* msg);
// TRITONSERVER_MemoryTypeString
typedef const char* (*TritonServerMemoryTypeStringFn_t)(
TRITONSERVER_MemoryType memtype);
// TRITONSERVER_InferenceResponseOutputCount
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseOutputCountFn_t)(
TRITONSERVER_InferenceResponse* inference_response, uint32_t* count);
// TRITONSERVER_DataTypeString
typedef const char* (*TritonServerDataTypeStringFn_t)(
TRITONSERVER_DataType datatype);
// TRITONSERVER_ErrorMessage
typedef const char* (*TritonServerErrorMessageFn_t)(
TRITONSERVER_Error* error);
// TRITONSERVER_ErrorDelete
typedef void (*TritonServerErrorDeleteFn_t)(TRITONSERVER_Error* error);
// TRITONSERVER_ErrorCodeString
typedef const char* (*TritonServerErrorCodeToStringFn_t)(
TRITONSERVER_Error* error);
// TRITONSERVER_ServerModelConfig
typedef TRITONSERVER_Error* (*TritonServerModelConfigFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, const uint32_t config_version,
TRITONSERVER_Message** model_config);
// TRITONSERVER_InferenceRequestSetCorrelationId
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetCorrelationIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
uint64_t correlation_id);
  // TRITONSERVER_InferenceRequestSetCorrelationIdString
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetStringCorrelationIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request,
const char* correlation_id);
// TRITONSERVER_InferenceRequestSetFlags
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetFlagsFn_t)(
TRITONSERVER_InferenceRequest* inference_request, uint32_t flags);
// TRITONSERVER_InferenceRequestSetPriorityUInt64
typedef TRITONSERVER_Error* (*TritonServerInferenceRequestSetPriorityFn_t)(
TRITONSERVER_InferenceRequest* inference_request, uint64_t priority);
// TRITONSERVER_InferenceRequestSetTimeoutMicroseconds
typedef TRITONSERVER_Error* (
*TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t)(
TRITONSERVER_InferenceRequest* inference_request, uint64_t timeout_us);
// TRITONSERVER_StringToDataType
typedef TRITONSERVER_DataType (*TritonServerStringToDatatypeFn_t)(
const char* dtype);
// TRITONSERVER_InferenceResponseOutput
typedef TRITONSERVER_Error* (*TritonServerInferenceResponseOutputFn_t)(
TRITONSERVER_InferenceResponse* inference_response, const uint32_t index,
const char** name, TRITONSERVER_DataType* datatype, const int64_t** shape,
uint64_t* dim_count, const void** base, size_t* byte_size,
TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
void** userp);
// TRITONSERVER_InferenceRequestId
typedef TRITONSERVER_Error* (*TritonServerRequestIdFn_t)(
TRITONSERVER_InferenceRequest* inference_request, const char** id);
// TRITONSERVER_InferenceRequestDelete
typedef TRITONSERVER_Error* (*TritonServerRequestDeleteFn_t)(
TRITONSERVER_InferenceRequest* inference_request);
// TRITONSERVER_ServerModelStatistics
typedef TRITONSERVER_Error* (*TritonServerModelStatisticsFn_t)(
TRITONSERVER_Server* server, const char* model_name,
const int64_t model_version, TRITONSERVER_Message** model_stats);
// TRITONSERVER_ServerUnloadModel
typedef TRITONSERVER_Error* (*TritonSeverUnloadModelFn_t)(
TRITONSERVER_Server* server, const char* model_name);
// TRITONSERVER_ServerOptionsSetLogInfo
typedef TRITONSERVER_Error* (*TritonSeverSetLogInfoFn_t)(
TRITONSERVER_ServerOptions* options, bool log);
// TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize
typedef TRITONSERVER_Error* (*TritonServerSetCudaMemoryPoolByteSizeFn_t)(
TRITONSERVER_ServerOptions* options, int gpu_device, uint64_t size);
private:
TritonLoader()
: InferenceServerClient(
false /* verbose flag that is set later during ::Create*/)
{
verbose_level_ = 0;
enforce_memory_type_ = false;
requested_memory_type_ = TRITONSERVER_MEMORY_CPU;
model_is_loaded_ = false;
server_is_ready_ = false;
shm_manager_ = std::make_unique<SharedMemoryManager>();
}
Error PopulateInternals(
const std::string& triton_server_path,
const std::string& model_repository_path, bool verbose);
/// Load all tritonserver.h functions onto triton_loader
/// internal handles
Error LoadServerLibrary();
void ClearHandles();
/// Check if file exists in the current directory
/// \param filepath Path of library to check
/// \return perfanalyzer::clientbackend::Error
Error FileExists(std::string& filepath);
Error InitializeRequest(
const tc::InferOptions& options,
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_ResponseAllocator** allocator,
TRITONSERVER_InferenceRequest** irequest);
Error AddInputs(
const std::vector<tc::InferInput*>& inputs,
TRITONSERVER_InferenceRequest* irequest);
Error AddOutputs(
const std::vector<const tc::InferRequestedOutput*>& outputs,
TRITONSERVER_InferenceRequest* irequest);
void* dlhandle_;
TritonServerApiVersionFn_t api_version_fn_;
TritonServerOptionsNewFn_t options_new_fn_;
TritonServerOptionSetModelRepoPathFn_t options_set_model_repo_path_fn_;
TritonServerSetLogVerboseFn_t set_log_verbose_fn_;
TritonServerSetBackendDirFn_t set_backend_directory_fn_;
TritonServerSetRepoAgentDirFn_t set_repo_agent_directory_fn_;
TritonServerSetStrictModelConfigFn_t set_strict_model_config_fn_;
TritonServerSetMinSupportedComputeCapabilityFn_t
set_min_supported_compute_capability_fn_;
TritonServerNewFn_t server_new_fn_;
TritonServerOptionsDeleteFn_t server_options_delete_fn_;
TritonServerDeleteFn_t server_delete_fn_;
TritonServerIsLiveFn_t server_is_live_fn_;
TritonServerIsReadyFn_t server_is_ready_fn_;
TritonServerMetadataFn_t server_metadata_fn_;
TritonServerMessageSerializeToJsonFn_t message_serialize_to_json_fn_;
TritonServerMessageDeleteFn_t message_delete_fn_;
TritonServerModelIsReadyFn_t model_is_ready_fn_;
TritonServerModelMetadataFn_t model_metadata_fn_;
TritonServerResponseAllocatorNewFn_t response_allocator_new_fn_;
TritonServerInferenceRequestNewFn_t inference_request_new_fn_;
TritonServerInferenceRequestSetIdFn_t inference_request_set_id_fn_;
TritonServerInferenceRequestSetReleaseCallbackFn_t
inference_request_set_release_callback_fn_;
TritonServerInferenceRequestAddInputFn_t inference_request_add_input_fn_;
TritonServerInferenceRequestAddRequestedOutputFn_t
inference_request_add_requested_output_fn_;
TritonServerInferenceRequestAppendInputDataFn_t
inference_request_append_input_data_fn_;
TritonServerInferenceRequestSetResponseCallbackFn_t
inference_request_set_response_callback_fn_;
TritonServerInferAsyncFn_t infer_async_fn_;
TritonServerInferenceResponseErrorFn_t inference_response_error_fn_;
TritonServerInferenceResponseDeleteFn_t inference_response_delete_fn_;
TritonServerResponseAllocatorDeleteFn_t response_allocator_delete_fn_;
TritonServerErrorNewFn_t error_new_fn_;
TritonServerMemoryTypeStringFn_t memory_type_string_fn_;
TritonServerInferenceResponseOutputCountFn_t
inference_response_output_count_fn_;
TritonServerDataTypeStringFn_t data_type_string_fn_;
TritonServerErrorMessageFn_t error_message_fn_;
TritonServerErrorDeleteFn_t error_delete_fn_;
TritonServerErrorCodeToStringFn_t error_code_to_string_fn_;
TritonServerModelConfigFn_t model_config_fn_;
TritonServerInferenceRequestSetCorrelationIdFn_t set_correlation_id_fn_;
TritonServerInferenceRequestSetStringCorrelationIdFn_t
set_string_correlation_id_fn_;
TritonServerInferenceRequestSetFlagsFn_t set_flags_fn_;
TritonServerInferenceRequestSetPriorityFn_t set_priority_fn_;
TritonServerInferenceRequestSetTimeoutMicrosecondsFn_t set_timeout_ms_fn_;
TritonServerStringToDatatypeFn_t string_to_datatype_fn_;
TritonServerInferenceResponseOutputFn_t inference_response_output_fn_;
TritonServerRequestIdFn_t request_id_fn_;
TritonServerRequestDeleteFn_t request_delete_fn_;
TritonServerModelStatisticsFn_t model_statistics_fn_;
TritonSeverUnloadModelFn_t unload_model_fn_;
TritonSeverSetLogInfoFn_t set_log_info_fn_;
TritonServerSetCudaMemoryPoolByteSizeFn_t set_cuda_memory_pool_byte_size_;
std::shared_ptr<TRITONSERVER_Server> server_{nullptr};
std::string triton_server_path_{};
const std::string server_library_path_{"/lib/libtritonserver.so"};
int verbose_level_{0};
TRITONSERVER_MemoryType requested_memory_type_{TRITONSERVER_MEMORY_CPU};
bool enforce_memory_type_{false};
std::string model_repository_path_{""};
std::string model_name_{""};
int64_t model_version_{-1};
bool model_is_loaded_{false};
bool server_is_ready_{false};
std::unique_ptr<SharedMemoryManager> shm_manager_{nullptr};
};
}}}} // namespace triton::perfanalyzer::clientbackend::tritoncapi
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "command_line_parser.h"
#include <getopt.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include "perf_analyzer_exception.h"
namespace triton { namespace perfanalyzer {
PAParamsPtr
CLParser::Parse(int argc, char** argv)
{
ParseCommandLine(argc, argv);
VerifyOptions();
return params_;
}
// Used to format the usage message
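// Wraps `str` at roughly 60 characters per line by replacing the nearest
// preceding space with a newline plus indentation; `offset` accounts for the
// width of the option name printed before the message.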
std::string
CLParser::FormatMessage(std::string str, int offset) const
{
int width = 60;
int current_pos = offset;
while (current_pos + width < int(str.length())) {
int n = str.rfind(' ', current_pos + width);
if (n != int(std::string::npos)) {
str.replace(n, 1, "\n\t ");
current_pos += (width + 10);
}
}
return str;
}
void
CLParser::Usage(const std::string& msg)
{
if (!msg.empty()) {
std::cerr << "Error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv_[0] << " [options]" << std::endl;
std::cerr << "==== SYNOPSIS ====\n \n";
std::cerr << "\t--version " << std::endl;
std::cerr << "\t--service-kind "
"<\"triton\"|\"tfserving\"|\"torchserve\"|\"triton_c_api\">"
<< std::endl;
std::cerr << "\t-m <model name>" << std::endl;
std::cerr << "\t-x <model version>" << std::endl;
std::cerr << "\t--bls-composing-models=<string>" << std::endl;
std::cerr << "\t--model-signature-name <model signature name>" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << std::endl;
std::cerr << "I. MEASUREMENT PARAMETERS: " << std::endl;
std::cerr << "\t--async (-a)" << std::endl;
std::cerr << "\t--sync" << std::endl;
std::cerr << "\t--measurement-interval (-p) <measurement window (in msec)>"
<< std::endl;
std::cerr << "\t--concurrency-range <start:end:step>" << std::endl;
std::cerr << "\t--request-rate-range <start:end:step>" << std::endl;
std::cerr << "\t--request-distribution <\"poisson\"|\"constant\">"
<< std::endl;
std::cerr << "\t--request-intervals <path to file containing time intervals "
"in microseconds>"
<< std::endl;
std::cerr << "\t--serial-sequences" << std::endl;
std::cerr << "\t--binary-search" << std::endl;
std::cerr << "\t--num-of-sequences <number of concurrent sequences>"
<< std::endl;
std::cerr << "\t--latency-threshold (-l) <latency threshold (in msec)>"
<< std::endl;
std::cerr << "\t--max-threads <thread counts>" << std::endl;
std::cerr << "\t--stability-percentage (-s) <deviation threshold for stable "
"measurement (in percentage)>"
<< std::endl;
std::cerr << "\t--max-trials (-r) <maximum number of measurements for each "
"profiling>"
<< std::endl;
std::cerr << "\t--percentile <percentile>" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-t <number of concurrent requests>" << std::endl;
std::cerr << "\t-c <maximum concurrency>" << std::endl;
std::cerr << "\t-d" << std::endl;
std::cerr << std::endl;
std::cerr << "II. INPUT DATA OPTIONS: " << std::endl;
std::cerr << "\t-b <batch size>" << std::endl;
std::cerr << "\t--input-data <\"zero\"|\"random\"|<path>>" << std::endl;
std::cerr << "\t--shared-memory <\"system\"|\"cuda\"|\"none\">" << std::endl;
std::cerr << "\t--output-shared-memory-size <size in bytes>" << std::endl;
std::cerr << "\t--shape <name:shape>" << std::endl;
std::cerr << "\t--sequence-length <length>" << std::endl;
std::cerr << "\t--sequence-length-variation <variation>" << std::endl;
std::cerr << "\t--sequence-id-range <start:end>" << std::endl;
std::cerr << "\t--string-length <length>" << std::endl;
std::cerr << "\t--string-data <string>" << std::endl;
std::cerr << "\t--input-tensor-format=[binary|json]" << std::endl;
std::cerr << "\t--output-tensor-format=[binary|json]" << std::endl;
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
std::cerr << "\t-z" << std::endl;
std::cerr << "\t--data-directory <path>" << std::endl;
std::cerr << std::endl;
std::cerr << "III. SERVER DETAILS: " << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-i <Protocol used to communicate with inference service>"
<< std::endl;
std::cerr << "\t--ssl-grpc-use-ssl <bool>" << std::endl;
std::cerr << "\t--ssl-grpc-root-certifications-file <path>" << std::endl;
std::cerr << "\t--ssl-grpc-private-key-file <path>" << std::endl;
std::cerr << "\t--ssl-grpc-certificate-chain-file <path>" << std::endl;
std::cerr << "\t--ssl-https-verify-peer <number>" << std::endl;
std::cerr << "\t--ssl-https-verify-host <number>" << std::endl;
std::cerr << "\t--ssl-https-ca-certificates-file <path>" << std::endl;
std::cerr << "\t--ssl-https-client-certificate-file <path>" << std::endl;
std::cerr << "\t--ssl-https-client-certificate-type <string>" << std::endl;
std::cerr << "\t--ssl-https-private-key-file <path>" << std::endl;
std::cerr << "\t--ssl-https-private-key-type <string>" << std::endl;
std::cerr << std::endl;
std::cerr << "IV. OTHER OPTIONS: " << std::endl;
std::cerr << "\t-f <filename for storing report in csv format>" << std::endl;
std::cerr << "\t--profile-export-file <path>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << "\t--streaming" << std::endl;
std::cerr << "\t--grpc-compression-algorithm <compression_algorithm>"
<< std::endl;
std::cerr << "\t--trace-file" << std::endl;
std::cerr << "\t--trace-level" << std::endl;
std::cerr << "\t--trace-rate" << std::endl;
std::cerr << "\t--trace-count" << std::endl;
std::cerr << "\t--log-frequency" << std::endl;
std::cerr << "\t--collect-metrics" << std::endl;
std::cerr << "\t--metrics-url" << std::endl;
std::cerr << "\t--metrics-interval" << std::endl;
std::cerr << std::endl;
std::cerr << "==== OPTIONS ==== \n \n";
std::cerr << FormatMessage(
" --version: print the current version of Perf Analyzer.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --service-kind: Describes the kind of service perf_analyzer to "
"generate load for. The options are \"triton\", \"triton_c_api\", "
"\"tfserving\" and \"torchserve\". Default value is \"triton\". "
"Note in order to use \"torchserve\" backend --input-data option "
"must point to a json file holding data in the following format "
"{\"data\" : [{\"TORCHSERVE_INPUT\" : [\"<complete path to the "
"content file>\"]}, {...}...]}. The type of file here will depend "
"on the model. In order to use \"triton_c_api\" you must specify "
"the Triton server install path and the model repository path via "
"the --triton-server-directory and --model-repository flags",
18)
<< std::endl;
std::cerr
<< std::setw(9) << std::left << " -m: "
<< FormatMessage(
"This is a required argument and is used to specify the model"
" against which to run perf_analyzer.",
9)
<< std::endl;
std::cerr << std::setw(9) << std::left << " -x: "
<< FormatMessage(
"The version of the above model to be used. If not specified"
" the most recent version (that is, the highest numbered"
" version) of the model will be used.",
9)
<< std::endl;
std::cerr << FormatMessage(
" --model-signature-name: The signature name of the saved "
"model to use. Default value is \"serving_default\". This "
"option will be ignored if --service-kind is not "
"\"tfserving\".",
18)
<< std::endl;
std::cerr << std::setw(9) << std::left
<< " -v: " << FormatMessage("Enables verbose mode.", 9)
<< std::endl;
std::cerr << std::setw(9) << std::left
<< " -v -v: " << FormatMessage("Enables extra verbose mode.", 9)
<< std::endl;
std::cerr << std::endl;
std::cerr << "I. MEASUREMENT PARAMETERS: " << std::endl;
std::cerr
<< FormatMessage(
" --async (-a): Enables asynchronous mode in perf_analyzer. "
"By default, perf_analyzer will use synchronous API to "
"request inference. However, if the model is sequential "
"then default mode is asynchronous. Specify --sync to "
"operate sequential models in synchronous mode. In synchronous "
"mode, perf_analyzer will start threads equal to the concurrency "
"level. Use asynchronous mode to limit the number of threads, yet "
"maintain the concurrency.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --sync: Force enables synchronous mode in perf_analyzer. "
"Can be used to operate perf_analyzer with sequential model "
"in synchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --measurement-interval (-p): Indicates the time interval used "
"for each measurement in milliseconds. The perf analyzer will "
"sample a time interval specified by -p and take measurement over "
"the requests completed within that time interval. The default "
"value is 5000 msec.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --measurement-mode <\"time_windows\"|\"count_windows\">: "
"Indicates the mode used for stabilizing measurements."
" \"time_windows\" will create windows such that the length "
"of each window is equal to --measurement-interval. "
"\"count_windows\" will create "
"windows such that there are at least "
"--measurement-request-count requests in each window.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --measurement-request-count: "
"Indicates the minimum number of requests to be collected in each "
"measurement window when \"count_windows\" mode is used. This "
"mode can "
"be enabled using the --measurement-mode flag.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --concurrency-range <start:end:step>: Determines the range of "
"concurrency levels covered by the perf_analyzer. The "
"perf_analyzer "
"will start from the concurrency level of 'start' and go till "
"'end' with a stride of 'step'. The default value of 'end' and "
"'step' are 1. If 'end' is not specified then perf_analyzer will "
"run for a single concurrency level determined by 'start'. If "
"'end' is set as 0, then the concurrency limit will be "
"incremented by 'step' till latency threshold is met. 'end' and "
"--latency-threshold can not be both 0 simultaneously. 'end' can "
"not be 0 for sequence models while using asynchronous mode.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-rate-range <start:end:step>: Determines the range of "
"request rates for load generated by analyzer. This option can "
"take floating-point values. The search along the request rate "
"range is enabled only when using this option. If not specified, "
"then analyzer will search along the concurrency-range. The "
"perf_analyzer will start from the request rate of 'start' and go "
"till 'end' with a stride of 'step'. The default values of "
"'start', 'end' and 'step' are all 1.0. If 'end' is not specified "
"then perf_analyzer will run for a single request rate as "
"determined by 'start'. If 'end' is set as 0.0, then the request "
"rate will be incremented by 'step' till latency threshold is "
"met. 'end' and --latency-threshold can not be both 0 "
"simultaneously.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-distribution <\"poisson\"|\"constant\">: Specifies "
"the time interval distribution between dispatching inference "
"requests to the server. Poisson distribution closely mimics the "
"real-world work load on a server. This option is ignored if not "
"using --request-rate-range. By default, this option is set to be "
"constant.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --request-intervals: Specifies a path to a file containing time "
"intervals in microseconds. Each time interval should be in a new "
"line. The analyzer will try to maintain time intervals between "
"successive generated requests to be as close as possible in this "
"file. This option can be used to apply custom load to server "
"with a certain pattern of interest. The analyzer will loop "
"around the file if the duration of execution exceeds to that "
"accounted for by the intervals. This option can not be used with "
"--request-rate-range or --concurrency-range.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
"--binary-search: Enables the binary search on the specified "
"search range. This option requires 'start' and 'end' to be "
"expilicitly specified in the --concurrency-range or "
"--request-rate-range. When using this option, 'step' is more "
"like the precision. Lower the 'step', more the number of "
"iterations along the search path to find suitable convergence. "
"By default, linear search is used.",
18)
<< std::endl;
std::cerr << FormatMessage(
"--num-of-sequences: Sets the number of concurrent "
"sequences for sequence models. This option is ignored when "
"--request-rate-range is not specified. By default, its "
"value is 4.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --latency-threshold (-l): Sets the limit on the observed "
"latency. Analyzer will terminate the concurrency search once "
"the measured latency exceeds this threshold. By default, "
"latency threshold is set 0 and the perf_analyzer will run "
"for entire --concurrency-range.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --max-threads: Sets the maximum number of threads that will be "
"created for providing desired concurrency or request rate. "
"However, when running"
"in synchronous mode with concurrency-range having explicit 'end' "
"specification,"
"this value will be ignored. Default is 4 if --request-rate-range "
"is specified otherwise default is 16.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --stability-percentage (-s): Indicates the allowed variation in "
"latency measurements when determining if a result is stable. The "
"measurement is considered as stable if the ratio of max / min "
"from the recent 3 measurements is within (stability percentage)% "
"in terms of both infer per second and latency. Default is "
"10(%).",
18)
<< std::endl;
std::cerr << FormatMessage(
" --max-trials (-r): Indicates the maximum number of "
"measurements for each concurrency level visited during "
"search. The perf analyzer will take multiple measurements "
"and report the measurement until it is stable. The perf "
"analyzer will abort if the measurement is still unstable "
"after the maximum number of measurements. The default "
"value is 10.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --percentile: Indicates the confidence value as a percentile "
"that will be used to determine if a measurement is stable. For "
"example, a value of 85 indicates that the 85th percentile "
"latency will be used to determine stability. The percentile will "
"also be reported in the results. The default is -1 indicating "
"that the average latency is used to determine stability",
18)
<< std::endl;
std::cerr << FormatMessage(
" --serial-sequences: Enables serial sequence mode "
"where a maximum of one request is outstanding at a time "
"for any given sequence. The default is false.",
18)
<< std::endl;
std::cerr << std::endl;
std::cerr << "II. INPUT DATA OPTIONS: " << std::endl;
std::cerr << std::setw(9) << std::left
<< " -b: " << FormatMessage("Batch size for each request sent.", 9)
<< std::endl;
std::cerr
<< FormatMessage(
" --input-data: Select the type of data that will be used "
"for input in inference requests. The available options are "
"\"zero\", \"random\", path to a directory or a json file. If the "
"option is path to a directory then the directory must "
"contain a binary/text file for each non-string/string input "
"respectively, named the same as the input. Each "
"file must contain the data required for that input for a batch-1 "
"request. Each binary file should contain the raw binary "
"representation of the input in row-major order for non-string "
"inputs. The text file should contain all strings needed by "
"batch-1, each in a new line, listed in row-major order. When "
"pointing to a json file, user must adhere to the format "
"described in the Performance Analyzer documentation. By "
"specifying json data users can control data used with every "
"request. Multiple data streams can be specified for a sequence "
"model and the analyzer will select a data stream in a "
"round-robin fashion for every new sequence. Multiple json files "
"can also be provided (--input-data json_file1 --input-data "
"json-file2 and so on) and the analyzer will append data streams "
"from each file. When using --service-kind=torchserve make sure "
"this option points to a json file. Default is \"random\".",
18)
<< std::endl;
std::cerr << FormatMessage(
" --shared-memory <\"system\"|\"cuda\"|\"none\">: Specifies "
"the type of the shared memory to use for input and output "
"data. Default is none.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --output-shared-memory-size: The size in bytes of the shared "
"memory region to allocate per output tensor. Only needed when "
"one or more of the outputs are of string type and/or variable "
"shape. The value should be larger than the size of the largest "
"output tensor the model is expected to return. The analyzer will "
"use the following formula to calculate the total shared memory "
"to allocate: output_shared_memory_size * number_of_outputs * "
"batch_size. Defaults to 100KB.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --shape: The shape used for the specified input. The "
"argument must be specified as 'name:shape' where the shape "
"is a comma-separated list for dimension sizes, for example "
"'--shape input_name:1,2,3' indicates tensor shape [ 1, 2, 3 "
"]. --shape may be specified multiple times to specify "
"shapes for different inputs.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --sequence-length: Indicates the base length of a "
"sequence used for sequence models. A sequence with length "
"X will be composed of X requests to be sent as the "
"elements in the sequence. The actual length of the sequence"
"will be within +/- Y% of the base length, where Y defaults "
"to 20% and is customizable via "
"`--sequence-length-variation`. If sequence length is "
"unspecified and input data is provided, the sequence "
"length will be the number of inputs in the user-provided "
"input data. Default is 20.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --sequence-length-variation: The percentage variation in "
"length of sequences. This flag is only valid when "
"not using user-provided input data or when "
"`--sequence-length` is specified while using user-provided "
"input data. Default is 20.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --sequence-id-range <start:end>: Determines the range of "
"sequence id used by the perf_analyzer. The perf_analyzer "
"will start from the sequence id of 'start' and go till "
"'end' (excluded). If 'end' is not specified then perf_analyzer "
"will use new sequence id without bounds. If 'end' is specified "
"and the concurrency setting may result in maintaining a number "
"of sequences more than the range of available sequence id, "
"perf analyzer will exit with error due to possible sequence id "
"collision. The default setting is start from sequence id 1 and "
"without bounds",
18)
<< std::endl;
std::cerr << FormatMessage(
" --string-length: Specifies the length of the random "
"strings to be generated by the analyzer for string input. "
"This option is ignored if --input-data points to a "
"directory. Default is 128.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --string-data: If provided, analyzer will use this string "
"to initialize string input buffers. The perf analyzer will "
"replicate the given string to build tensors of required "
"shape. --string-length will not have any effect. This "
"option is ignored if --input-data points to a directory.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --input-tensor-format=[binary|json]: Specifies Triton "
"inference request input tensor format. Only valid when "
"HTTP protocol is used. Default is 'binary'.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --output-tensor-format=[binary|json]: Specifies Triton "
"inference response output tensor format. Only valid when "
"HTTP protocol is used. Default is 'binary'.",
18)
<< std::endl;
std::cerr << std::endl;
std::cerr << "III. SERVER DETAILS: " << std::endl;
std::cerr << std::setw(38) << std::left << " -u: "
<< FormatMessage(
"Specify URL to the server. When using triton default is "
"\"localhost:8000\" if using HTTP and \"localhost:8001\" "
"if using gRPC. When using tfserving default is "
"\"localhost:8500\". ",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " -i: "
<< FormatMessage(
"The communication protocol to use. The available protocols "
"are gRPC and HTTP. Default is HTTP.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-grpc-use-ssl: "
<< FormatMessage(
"Bool (true|false) for whether "
"to use encrypted channel to the server. Default false.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-grpc-root-certifications-file: "
<< FormatMessage(
"Path to file containing the "
"PEM encoding of the server root certificates.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-grpc-private-key-file: "
<< FormatMessage(
"Path to file containing the "
"PEM encoding of the client's private key.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-grpc-certificate-chain-file: "
<< FormatMessage(
"Path to file containing the "
"PEM encoding of the client's certificate chain.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-https-verify-peer: "
<< FormatMessage(
"Number (0|1) to verify the "
"peer's SSL certificate. See "
"https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html for "
"the meaning of each value. Default is 1.",
38)
<< std::endl;
std::cerr
<< std::setw(38) << std::left << " --ssl-https-verify-host: "
<< FormatMessage(
"Number (0|1|2) to verify the "
"certificate's name against host. "
"See https://curl.se/libcurl/c/CURLOPT_SSL_VERIFYHOST.html for "
"the meaning of each value. Default is 2.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-https-ca-certificates-file: "
<< FormatMessage(
"Path to Certificate Authority "
"(CA) bundle.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-https-client-certificate-file: "
<< FormatMessage("Path to the SSL client certificate.", 38)
<< std::endl;
std::cerr << std::setw(38) << std::left
<< " --ssl-https-client-certificate-type: "
<< FormatMessage(
"Type (PEM|DER) of the client "
"SSL certificate. Default is PEM.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-https-private-key-file: "
<< FormatMessage(
"Path to the private keyfile "
"for TLS and SSL client cert.",
38)
<< std::endl;
std::cerr << std::setw(38) << std::left << " --ssl-https-private-key-type: "
<< FormatMessage(
"Type (PEM|DER) of the private "
"key file. Default is PEM.",
38)
<< std::endl;
std::cerr << std::endl;
std::cerr << "IV. OTHER OPTIONS: " << std::endl;
std::cerr
<< std::setw(9) << std::left << " -f: "
<< FormatMessage(
"The latency report will be stored in the file named by "
"this option. By default, the result is not recorded in a file.",
9)
<< std::endl;
std::cerr << std::setw(9) << std::left << " --profile-export-file: "
<< FormatMessage(
"Specifies the path that the profile export will be "
"generated at. By default, the profile export will not be "
"generated.",
9)
<< std::endl;
std::cerr
<< std::setw(9) << std::left << " -H: "
<< FormatMessage(
"The header will be added to HTTP requests (ignored for GRPC "
"requests). The header must be specified as 'Header:Value'. -H "
"may be specified multiple times to add multiple headers.",
9)
<< std::endl;
std::cerr
<< FormatMessage(
" --streaming: Enables the use of streaming API. This flag is "
"only valid with gRPC protocol. By default, it is set false.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --grpc-compression-algorithm: The compression algorithm "
"to be used by gRPC when sending request. Only supported "
"when grpc protocol is being used. The supported values are "
"none, gzip, and deflate. Default value is none.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --trace-file: Set the file where trace output will be saved."
" If --log-frequency is also specified, this argument "
"value will be the prefix of the files to save the trace "
"output. See --log-frequency for details. Only used for "
"service-kind of triton. Default value is none.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --trace-level: Specify a trace level. OFF to disable tracing, "
"TIMESTAMPS to trace timestamps, TENSORS to trace tensors. It "
"may be specified multiple times to trace multiple "
"information. Default is OFF.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --trace-rate: Set the trace sampling rate. Default is 1000.", 18)
<< std::endl;
std::cerr << FormatMessage(
" --trace-count: Set the number of traces to be sampled. "
"If the value is -1, the number of traces to be sampled "
"will not be limited. Default is -1.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --log-frequency: Set the trace log frequency. If the "
"value is 0, Triton will only log the trace output to "
"<trace-file> when shutting down. Otherwise, Triton will log "
"the trace output to <trace-file>.<idx> when it collects the "
"specified number of traces. For example, if the log frequency "
"is 100, when Triton collects the 100-th trace, it logs the "
"traces to file <trace-file>.0, and when it collects the 200-th "
"trace, it logs the 101-th to the 200-th traces to file "
"<trace-file>.1. Default is 0.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --triton-server-directory: The Triton server install "
"path. Required by and only used when C API "
"is used (--service-kind=triton_c_api). "
"eg:--triton-server-directory=/opt/tritonserver.",
18)
<< std::endl;
std::cerr
<< FormatMessage(
" --model-repository: The model repository of which the model is "
"loaded. Required by and only used when C API is used "
"(--service-kind=triton_c_api). "
"eg:--model-repository=/tmp/host/docker-data/model_unit_test.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --verbose-csv: The csv files generated by perf analyzer "
"will include additional information.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --collect-metrics: Enables collection of server-side "
"inference server metrics. Outputs metrics in the csv file "
"generated with the -f option. Must enable `--verbose-csv` "
"option to use the `--collect-metrics`.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --metrics-url: The URL to query for server-side inference "
"server metrics. Default is 'localhost:8002/metrics'.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --metrics-interval: How often in milliseconds, within "
"each measurement window, to query for server-side "
"inference server metrics. Default is 1000.",
18)
<< std::endl;
std::cerr << FormatMessage(
" --bls-composing-models: A comma separated list of all "
"BLS composing models (with optional model version number "
"after a colon for each) that may be called by the input "
"BLS model. For example, 'modelA:3,modelB' would specify "
"that modelA and modelB are composing models that may be "
"called by the input BLS model, and that modelA will use "
"version 3, while modelB's version is unspecified",
18)
<< std::endl;
exit(GENERIC_ERROR);
}
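// Illustrative example invocations exercising the options documented in
// Usage() above (the model name is hypothetical):
//   perf_analyzer -m my_model --concurrency-range 1:8:2
//   perf_analyzer -m my_model -i grpc --request-rate-range 100:500:100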
void
CLParser::PrintVersion()
{
std::cerr << "Perf Analyzer Version " << VERSION << " (commit " << SHA << ")"
<< std::endl;
exit(SUCCESS);
}
void
CLParser::ParseCommandLine(int argc, char** argv)
{
argc_ = argc;
argv_ = argv;
// {name, has_arg, *flag, val}
static struct option long_options[] = {
{"streaming", no_argument, 0, 0},
{"max-threads", required_argument, 0, 1},
{"sequence-length", required_argument, 0, 2},
{"percentile", required_argument, 0, 3},
{"data-directory", required_argument, 0, 4},
{"shape", required_argument, 0, 5},
{"measurement-interval", required_argument, 0, 6},
{"concurrency-range", required_argument, 0, 7},
{"latency-threshold", required_argument, 0, 8},
{"stability-percentage", required_argument, 0, 9},
{"max-trials", required_argument, 0, 10},
{"input-data", required_argument, 0, 11},
{"string-length", required_argument, 0, 12},
{"string-data", required_argument, 0, 13},
{"async", no_argument, 0, 14},
{"sync", no_argument, 0, 15},
{"request-rate-range", required_argument, 0, 16},
{"num-of-sequences", required_argument, 0, 17},
{"binary-search", no_argument, 0, 18},
{"request-distribution", required_argument, 0, 19},
{"request-intervals", required_argument, 0, 20},
{"shared-memory", required_argument, 0, 21},
{"output-shared-memory-size", required_argument, 0, 22},
{"service-kind", required_argument, 0, 23},
{"model-signature-name", required_argument, 0, 24},
{"grpc-compression-algorithm", required_argument, 0, 25},
{"measurement-mode", required_argument, 0, 26},
{"measurement-request-count", required_argument, 0, 27},
{"triton-server-directory", required_argument, 0, 28},
{"model-repository", required_argument, 0, 29},
{"sequence-id-range", required_argument, 0, 30},
{"ssl-grpc-use-ssl", no_argument, 0, 31},
{"ssl-grpc-root-certifications-file", required_argument, 0, 32},
{"ssl-grpc-private-key-file", required_argument, 0, 33},
{"ssl-grpc-certificate-chain-file", required_argument, 0, 34},
{"ssl-https-verify-peer", required_argument, 0, 35},
{"ssl-https-verify-host", required_argument, 0, 36},
{"ssl-https-ca-certificates-file", required_argument, 0, 37},
{"ssl-https-client-certificate-file", required_argument, 0, 38},
{"ssl-https-client-certificate-type", required_argument, 0, 39},
{"ssl-https-private-key-file", required_argument, 0, 40},
{"ssl-https-private-key-type", required_argument, 0, 41},
{"verbose-csv", no_argument, 0, 42},
{"enable-mpi", no_argument, 0, 43},
{"trace-file", required_argument, 0, 44},
{"trace-level", required_argument, 0, 45},
{"trace-rate", required_argument, 0, 46},
{"trace-count", required_argument, 0, 47},
{"log-frequency", required_argument, 0, 48},
{"collect-metrics", no_argument, 0, 49},
{"metrics-url", required_argument, 0, 50},
{"metrics-interval", required_argument, 0, 51},
{"sequence-length-variation", required_argument, 0, 52},
{"bls-composing-models", required_argument, 0, 53},
{"serial-sequences", no_argument, 0, 54},
{"input-tensor-format", required_argument, 0, 55},
{"output-tensor-format", required_argument, 0, 56},
{"version", no_argument, 0, 57},
{"profile-export-file", required_argument, 0, 58},
{0, 0, 0, 0}};
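// Note: the 'val' of each long option above doubles as both its case label
// in the switch below and its index into long_options (the error handler in
// the catch block relies on this), so keep the values sequential when adding
// new options.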
// Parse commandline...
int opt;
while ((opt = getopt_long(
argc, argv, "vdazc:u:m:x:b:t:p:i:H:l:r:s:f:", long_options,
NULL)) != -1) {
try {
switch (opt) {
case 0:
params_->streaming = true;
break;
case 1: {
std::string max_threads{optarg};
if (std::stoi(max_threads) > 0) {
params_->max_threads = std::stoull(max_threads);
params_->max_threads_specified = true;
} else {
Usage("Failed to parse --max-threads. The value must be > 0.");
}
break;
}
case 2: {
std::string sequence_length{optarg};
if (std::stoi(sequence_length) > 0) {
params_->sequence_length = std::stoull(sequence_length);
} else {
std::cerr << "WARNING: The sequence length must be > 0. Perf "
"Analyzer will use default value if it is measuring "
"on sequence model."
<< std::endl;
}
params_->sequence_length_specified = true;
break;
}
case 3:
params_->percentile = std::atoi(optarg);
break;
case 4:
params_->user_data.push_back(optarg);
break;
case 5: {
std::string arg = optarg;
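// Example (illustrative): "--shape INPUT0:1,2,3" yields name "INPUT0"
// and shape {1, 2, 3}.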
auto colon_pos = arg.rfind(":");
if (colon_pos == std::string::npos) {
Usage(
"Failed to parse --shape. There must be a colon after input "
"name.");
}
std::string name = arg.substr(0, colon_pos);
std::string shape_str = arg.substr(name.size() + 1);
size_t pos = 0;
std::vector<int64_t> shape;
while (pos != std::string::npos) {
size_t comma_pos = shape_str.find(",", pos);
int64_t dim;
if (comma_pos == std::string::npos) {
dim = std::stoll(shape_str.substr(pos, comma_pos));
pos = comma_pos;
} else {
dim = std::stoll(shape_str.substr(pos, comma_pos - pos));
pos = comma_pos + 1;
}
if (dim <= 0) {
Usage(
"Failed to parse --shape. The dimensions of input tensor "
"must be > 0.");
}
shape.emplace_back(dim);
}
params_->input_shapes[name] = shape;
break;
}
case 6:
case 'p': {
std::string measurement_window_ms{optarg};
if (std::stoi(measurement_window_ms) > 0) {
params_->measurement_window_ms = std::stoull(measurement_window_ms);
} else {
Usage(
"Failed to parse --measurement-interval (-p). The value must "
"be > 0 msec.");
}
break;
}
case 7: {
params_->using_concurrency_range = true;
std::string arg = optarg;
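// Example (illustrative): "--concurrency-range 2:16:2" yields
// start = 2, end = 16, step = 2.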
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 2) {
Usage(
"Failed to parse --concurrency-range. The value does not "
"match <start:end:step>.");
}
int64_t val;
if (colon_pos == std::string::npos) {
val = std::stoull(arg.substr(pos, colon_pos));
pos = colon_pos;
} else {
val = std::stoull(arg.substr(pos, colon_pos - pos));
pos = colon_pos + 1;
}
switch (index) {
case 0:
params_->concurrency_range.start = val;
break;
case 1:
params_->concurrency_range.end = val;
break;
case 2:
params_->concurrency_range.step = val;
break;
}
index++;
}
break;
}
case 8:
case 'l': {
std::string latency_threshold_ms{optarg};
if (std::stoi(latency_threshold_ms) == 0) {
params_->latency_threshold_ms = NO_LIMIT;
} else if (std::stoi(latency_threshold_ms) > 0) {
params_->latency_threshold_ms = std::stoull(latency_threshold_ms);
} else {
Usage(
"Failed to parse --latency-threshold (-l). The value must be "
">= 0 msecs.");
}
break;
}
case 9:
case 's': {
std::string stability_threshold{optarg};
if (std::stof(stability_threshold) >= 0.0) {
params_->stability_threshold = std::stof(optarg) / 100;
} else {
Usage(
"Failed to parse --stability-percentage (-s). The value must "
"be >= 0.0.");
}
break;
}
case 10:
case 'r': {
std::string max_trials{optarg};
if (std::stoi(max_trials) > 0) {
params_->max_trials = std::stoull(max_trials);
} else {
Usage("Failed to parse --max-trials (-r). The value must be > 0.");
}
break;
}
case 11: {
std::string arg = optarg;
// Check whether the argument is a directory
if (IsDirectory(arg) || IsFile(arg)) {
params_->user_data.push_back(optarg);
} else if (arg.compare("zero") == 0) {
params_->zero_input = true;
} else if (arg.compare("random") == 0) {
break;
} else {
Usage(
"Failed to parse --input-data. Unsupported type provided: '" +
std::string(optarg) +
"'. The available options are 'zero', 'random', path to a "
"directory, or a json file.");
}
break;
}
case 12: {
std::string string_length{optarg};
if (std::stoi(string_length) > 0) {
params_->string_length = std::stoull(string_length);
} else {
Usage("Failed to parse --string-length. The value must be > 0");
}
break;
}
case 13: {
params_->string_data = optarg;
break;
}
case 14:
case 'a': {
params_->async = true;
break;
}
case 15: {
params_->forced_sync = true;
break;
}
case 16: {
params_->using_request_rate_range = true;
std::string arg = optarg;
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 2) {
Usage(
"Failed to parse --request-rate-range. The value does not "
"match <start:end:step>.");
}
if (colon_pos == std::string::npos) {
params_->request_rate_range[index] =
std::stod(arg.substr(pos, colon_pos));
pos = colon_pos;
} else {
params_->request_rate_range[index] =
std::stod(arg.substr(pos, colon_pos - pos));
pos = colon_pos + 1;
index++;
}
}
break;
}
case 17: {
std::string num_of_sequences{optarg};
if (std::stoi(num_of_sequences) > 0) {
params_->num_of_sequences = std::stoul(num_of_sequences);
} else {
Usage("Failed to parse --num-of-sequences. The value must be > 0.");
}
break;
}
case 18: {
params_->search_mode = SearchMode::BINARY;
break;
}
case 19: {
std::string arg = optarg;
if (arg.compare("poisson") == 0) {
params_->request_distribution = Distribution::POISSON;
} else if (arg.compare("constant") == 0) {
params_->request_distribution = Distribution::CONSTANT;
} else {
Usage(
"Failed to parse --request-distribution. Unsupported type "
"provided: '" +
std::string(optarg) + "'. Choices are 'posson' or 'constant'.");
}
break;
}
case 20: {
std::string request_intervals_file{optarg};
if (IsFile(request_intervals_file)) {
params_->request_intervals_file = request_intervals_file;
params_->using_custom_intervals = true;
} else {
Usage(
"Failed to parse --request-intervals. The value must be a "
"valid file path");
}
break;
}
case 21: {
std::string arg = optarg;
if (arg.compare("system") == 0) {
params_->shared_memory_type =
SharedMemoryType::SYSTEM_SHARED_MEMORY;
} else if (arg.compare("cuda") == 0) {
#ifdef TRITON_ENABLE_GPU
params_->shared_memory_type = SharedMemoryType::CUDA_SHARED_MEMORY;
#else
Usage(
"Cuda shared memory is not supported when "
"TRITON_ENABLE_GPU=0.");
#endif // TRITON_ENABLE_GPU
} else if (arg.compare("none") == 0) {
params_->shared_memory_type = SharedMemoryType::NO_SHARED_MEMORY;
} else {
Usage(
"Failed to parse --shared-memory. Unsupported type provided: "
"'" +
std::string(optarg) +
"'. The available options are 'system', 'cuda', or 'none'.");
}
break;
}
case 22: {
std::string output_shm_size{optarg};
if (std::stoi(output_shm_size) >= 0) {
params_->output_shm_size = std::stoull(output_shm_size);
} else {
Usage(
"Failed to parse --output-shared-memory-size. The value must "
"be >= 0.");
}
break;
}
case 23: {
std::string arg = optarg;
if (arg.compare("triton") == 0) {
params_->kind = cb::TRITON;
} else if (arg.compare("tfserving") == 0) {
params_->kind = cb::TENSORFLOW_SERVING;
} else if (arg.compare("torchserve") == 0) {
params_->kind = cb::TORCHSERVE;
} else if (arg.compare("triton_c_api") == 0) {
params_->kind = cb::TRITON_C_API;
} else {
Usage(
"Failed to parse --service-kind. Unsupported type provided: '" +
std::string{optarg} +
"'. The available options are 'triton', 'tfserving', "
"'torchserve', or 'triton_c_api'.");
}
break;
}
case 24:
params_->model_signature_name = optarg;
break;
case 25: {
std::string arg = optarg;
if (arg.compare("none") == 0) {
params_->compression_algorithm = cb::COMPRESS_NONE;
} else if (arg.compare("deflate") == 0) {
params_->compression_algorithm = cb::COMPRESS_DEFLATE;
} else if (arg.compare("gzip") == 0) {
params_->compression_algorithm = cb::COMPRESS_GZIP;
} else {
Usage(
"Failed to parse --grpc-compression-algorithm. Unsupported "
"type provided: '" +
arg +
"'. The available options are 'gzip', 'deflate', or 'none'.");
}
params_->using_grpc_compression = true;
break;
}
case 26: {
std::string arg = optarg;
if (arg.compare("time_windows") == 0) {
params_->measurement_mode = MeasurementMode::TIME_WINDOWS;
} else if (arg.compare("count_windows") == 0) {
params_->measurement_mode = MeasurementMode::COUNT_WINDOWS;
} else {
Usage(
"Failed to parse --measurement-mode. Unsupported type "
"provided: '" +
arg +
"'. The available options are 'time_windows' or "
"'count_windows'.");
}
break;
}
case 27: {
std::string request_count{optarg};
if (std::stoi(request_count) > 0) {
params_->measurement_request_count = std::stoull(request_count);
} else {
Usage(
"Failed to parse --measurement-request-count. The value must "
"be > 0.");
}
break;
}
case 28: {
params_->triton_server_path = optarg;
break;
}
case 29: {
params_->model_repository_path = optarg;
break;
}
case 30: {
std::string arg = optarg;
int64_t start_id = 0;
int64_t end_id = 0;
size_t pos = 0;
int index = 0;
while (pos != std::string::npos) {
size_t colon_pos = arg.find(":", pos);
if (index > 1) {
Usage(
"Failed to parse --sequence-id-range. The value does not "
"match <start:end>.");
}
if (colon_pos == std::string::npos) {
std::string sequence_id{arg.substr(pos, colon_pos)};
if (index == 0) {
start_id = std::stoi(sequence_id);
} else {
end_id = std::stoi(sequence_id);
}
pos = colon_pos;
} else {
std::string sequence_id{arg.substr(pos, colon_pos - pos)};
start_id = std::stoi(sequence_id);
pos = colon_pos + 1;
index++;
}
}
// Check for invalid inputs. Only validate 'end' when it was provided.
if (start_id < 0 || (index > 0 && end_id < 0)) {
Usage(
"Failed to parse --sequence-id-range. The range values must be "
">= 0.");
} else if (index > 0 && start_id > end_id) {
Usage(
"Failed to parse --sequence-id-range. The 'end' value must be "
"greater than or equal to the 'start' value.");
}
if (index == 0) { // Only start ID is given
params_->start_sequence_id = start_id;
} else {
params_->start_sequence_id = start_id;
params_->sequence_id_range = end_id - start_id;
}
break;
}
case 31: {
params_->ssl_options.ssl_grpc_use_ssl = true;
break;
}
case 32: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_grpc_root_certifications_file = optarg;
} else {
Usage(
"Failed to parse --ssl-grpc-root-certifications-file. The "
"value must be a valid file path.");
}
break;
}
case 33: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_grpc_private_key_file = optarg;
} else {
Usage(
"Failed to parse --ssl-grpc-private-key-file. The value must "
"be a valid file path.");
}
break;
}
case 34: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_grpc_certificate_chain_file = optarg;
} else {
Usage(
"Failed to parse --ssl-grpc-certificate-chain-file. The value "
"must be a valid file path.");
}
break;
}
case 35: {
if (std::atol(optarg) == 0 || std::atol(optarg) == 1) {
params_->ssl_options.ssl_https_verify_peer = std::atol(optarg);
} else {
Usage(
"Failed to parse --ssl-https-verify-peer. The value must be "
"either 0 or 1.");
}
break;
}
case 36: {
if (std::atol(optarg) == 0 || std::atol(optarg) == 1 ||
std::atol(optarg) == 2) {
params_->ssl_options.ssl_https_verify_host = std::atol(optarg);
} else {
Usage(
"Failed to parse --ssl-https-verify-host. The value must be "
"either 0, 1, or 2.");
}
break;
}
case 37: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_https_ca_certificates_file = optarg;
} else {
Usage(
"Failed to parse --ssl-https-ca-certificates-file. The value "
"must be a valid file path.");
}
break;
}
case 38: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_https_client_certificate_file = optarg;
} else {
Usage(
"Failed to parse --ssl-https-client-certificate-file. The "
"value must be a valid file path.");
}
break;
}
case 39: {
if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") {
params_->ssl_options.ssl_https_client_certificate_type = optarg;
} else {
Usage(
"Failed to parse --ssl-https-client-certificate-type. "
"Unsupported type provided: '" +
std::string{optarg} +
"'. The available options are 'PEM' or 'DER'.");
}
break;
}
case 40: {
if (IsFile(optarg)) {
params_->ssl_options.ssl_https_private_key_file = optarg;
} else {
Usage(
"Failed to parse --ssl-https-private-key-file. The value must "
"be a valid file path.");
}
break;
}
case 41: {
if (std::string(optarg) == "PEM" || std::string(optarg) == "DER") {
params_->ssl_options.ssl_https_private_key_type = optarg;
} else {
Usage(
"Failed to parse --ssl-https-private-key-type. Unsupported "
"type provided: '" +
std::string{optarg} +
"'. The available options are 'PEM' or 'DER'.");
}
break;
}
case 42: {
params_->verbose_csv = true;
break;
}
case 43: {
params_->enable_mpi = true;
break;
}
case 44: {
params_->trace_options["trace_file"] = {optarg};
break;
}
case 45: {
std::string trace_level{optarg};
if (trace_level == "OFF" || trace_level == "TIMESTAMPS" ||
trace_level == "TENSORS") {
params_->trace_options["trace_level"] = {trace_level};
} else {
Usage(
"Failed to parse --trace-level. Unsupported type provided: '" +
trace_level +
"'. The available options are 'OFF', 'TIMESTAMPS', or "
"'TENSORS'.");
}
break;
}
case 46: {
params_->trace_options["trace_rate"] = {optarg};
break;
}
case 47: {
std::string trace_count{optarg};
if (std::stoi(trace_count) >= -1) {
params_->trace_options["trace_count"] = {trace_count};
} else {
Usage(
"Failed to parse --trace-count. The value must be >= 0 or set "
"to -1 (default).");
}
break;
}
case 48: {
std::string log_frequency{optarg};
if (std::stoi(log_frequency) >= 0) {
params_->trace_options["log_frequency"] = {log_frequency};
} else {
Usage("Failed to parse --log-frequency. The value must be >= 0.");
}
break;
}
case 49: {
params_->should_collect_metrics = true;
break;
}
case 50: {
params_->metrics_url = optarg;
params_->metrics_url_specified = true;
break;
}
case 51: {
std::string metrics_interval_ms{optarg};
if (std::stoi(metrics_interval_ms) > 0) {
params_->metrics_interval_ms = std::stoull(metrics_interval_ms);
params_->metrics_interval_ms_specified = true;
} else {
Usage(
"Failed to parse --metrics-interval. The value must be > 0 "
"msecs.");
}
break;
}
case 52: {
params_->sequence_length_variation = std::stod(optarg);
break;
}
case 53: {
std::string arg = optarg;
// Remove all spaces in the string
arg.erase(
std::remove_if(arg.begin(), arg.end(), ::isspace), arg.end());
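// Example (illustrative): "modelA:3,modelB" yields {"modelA", "3"} and
// {"modelB", ""}.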
std::stringstream ss(arg);
while (ss.good()) {
std::string model_name;
std::string model_version{""};
std::string tmp_model_name;
getline(ss, tmp_model_name, ',');
size_t colon_pos = tmp_model_name.find(":");
if (colon_pos == std::string::npos) {
model_name = tmp_model_name;
} else {
model_name = tmp_model_name.substr(0, colon_pos);
model_version = tmp_model_name.substr(colon_pos + 1);
}
params_->bls_composing_models.push_back(
{model_name, model_version});
}
break;
}
case 54: {
params_->serial_sequences = true;
break;
}
case 55: {
cb::TensorFormat input_tensor_format{ParseTensorFormat(optarg)};
if (input_tensor_format == cb::TensorFormat::UNKNOWN) {
Usage(
"Failed to parse --input-tensor-format. Unsupported type "
"provided: '" +
std::string{optarg} +
"'. The available options are 'binary' or 'json'.");
}
params_->input_tensor_format = input_tensor_format;
break;
}
case 56: {
cb::TensorFormat output_tensor_format{ParseTensorFormat(optarg)};
if (output_tensor_format == cb::TensorFormat::UNKNOWN) {
Usage(
"Failed to parse --output-tensor-format. Unsupported type "
"provided: '" +
std::string{optarg} +
"'. The available options are 'binary' or 'json'.");
}
params_->output_tensor_format = output_tensor_format;
break;
}
case 57: {
PrintVersion();
break;
}
case 58: {
std::string profile_export_file{optarg};
if (IsFile(profile_export_file) || IsDirectory(profile_export_file)) {
Usage(
"Failed to parse --profile-export-file. Path must not already "
"exist.");
}
params_->profile_export_file = profile_export_file;
break;
}
case 'v':
params_->extra_verbose = params_->verbose;
params_->verbose = true;
break;
case 'z':
params_->zero_input = true;
break;
case 'd':
params_->using_old_options = true;
params_->dynamic_concurrency_mode = true;
break;
case 'u':
params_->url_specified = true;
params_->url = optarg;
break;
case 'm':
params_->model_name = optarg;
break;
case 'x':
params_->model_version = optarg;
break;
case 'b': {
std::string batch_size{optarg};
if (std::stoi(batch_size) > 0) {
params_->batch_size = std::stoull(batch_size);
params_->using_batch_size = true;
} else {
Usage("Failed to parse -b (batch size). The value must be > 0.");
}
break;
}
case 't':
params_->using_old_options = true;
params_->concurrent_request_count = std::atoi(optarg);
break;
case 'i':
params_->protocol = ParseProtocol(optarg);
break;
case 'H': {
std::string arg = optarg;
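// The header name is everything before the first ':' and the value is
// everything after it, so values may themselves contain ':'.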
std::string header = arg.substr(0, arg.find(":"));
(*params_->http_headers)[header] = arg.substr(header.size() + 1);
break;
}
case 'c':
params_->using_old_options = true;
params_->max_concurrency = std::atoi(optarg);
break;
case 'f':
params_->filename = optarg;
break;
case '?':
Usage();
break;
}
}
catch (const std::invalid_argument& ia) {
if (opt >= 'A') { // short options
Usage(
"Failed to parse -" + std::string{(char)opt} +
". Invalid value provided: " + std::string{optarg});
} else {
Usage(
"Failed to parse --" + std::string{long_options[opt].name} +
". Invalid value provided: " + std::string{optarg});
}
}
}
params_->mpi_driver = std::shared_ptr<triton::perfanalyzer::MPIDriver>{
std::make_shared<triton::perfanalyzer::MPIDriver>(params_->enable_mpi)};
params_->mpi_driver->MPIInit(&argc, &argv);
if (!params_->url_specified &&
(params_->protocol == cb::ProtocolType::GRPC)) {
if (params_->kind == cb::BackendKind::TRITON) {
params_->url = "localhost:8001";
} else if (params_->kind == cb::BackendKind::TENSORFLOW_SERVING) {
params_->url = "localhost:8500";
}
}
// Overriding the max_threads default for request_rate search
if (!params_->max_threads_specified && params_->targeting_concurrency()) {
params_->max_threads = 16;
}
if (params_->using_custom_intervals) {
// Will be using user-provided time intervals, hence no control variable.
params_->search_mode = SearchMode::NONE;
}
}
void
CLParser::VerifyOptions()
{
if (params_->model_name.empty()) {
Usage("Failed to parse -m (model name). The value must be specified.");
}
if (params_->concurrency_range.start <= 0 ||
params_->concurrent_request_count < 0) {
Usage("The start of the search range must be > 0");
}
if (params_->request_rate_range[SEARCH_RANGE::kSTART] <= 0) {
Usage(
"Failed to parse --request-rate-range. The start of the search range "
"must be > 0.");
}
if (params_->protocol == cb::ProtocolType::UNKNOWN) {
Usage(
"Failed to parse -i (protocol). The value should be either HTTP or "
"gRPC.");
}
if (params_->streaming && (params_->protocol != cb::ProtocolType::GRPC)) {
Usage("Streaming is only allowed with gRPC protocol.");
}
if (params_->using_grpc_compression &&
(params_->protocol != cb::ProtocolType::GRPC)) {
Usage("Using compression algorithm is only allowed with gRPC protocol.");
}
if (params_->sequence_length_variation < 0.0) {
Usage(
"Failed to parse --sequence-length-variation. The value must be >= "
"0.0.");
}
if (params_->start_sequence_id == 0) {
params_->start_sequence_id = 1;
std::cerr << "WARNING: using an invalid start sequence id. Perf Analyzer"
<< " will use default value if it is measuring on sequence model."
<< std::endl;
}
if (params_->percentile != -1 &&
(params_->percentile > 99 || params_->percentile < 1)) {
Usage(
"Failed to parse --percentile. The value must be -1 for not reporting "
"or in range (0, 100).");
}
if (params_->zero_input && !params_->user_data.empty()) {
Usage("The -z flag cannot be set when --data-directory is provided.");
}
if (params_->async && params_->forced_sync) {
Usage("Cannot specify --async and --sync simultaneously.");
}
if (params_->using_concurrency_range && params_->using_old_options) {
Usage("Cannot use deprecated options with --concurrency-range.");
} else if (params_->using_old_options) {
if (params_->dynamic_concurrency_mode) {
params_->concurrency_range.end = params_->max_concurrency;
}
params_->concurrency_range.start = params_->concurrent_request_count;
}
if (params_->using_request_rate_range && params_->using_old_options) {
Usage("Cannot use concurrency options with --request-rate-range.");
}
if (params_->using_request_rate_range && params_->using_concurrency_range) {
Usage(
"Cannot specify --concurrency-range and --request-rate-range "
"simultaneously.");
}
if (params_->using_request_rate_range && params_->mpi_driver->IsMPIRun() &&
(params_->request_rate_range[SEARCH_RANGE::kEND] != 1.0 ||
params_->request_rate_range[SEARCH_RANGE::kSTEP] != 1.0)) {
Usage("Cannot specify --request-rate-range when in multi-model mode.");
}
if (params_->using_custom_intervals && params_->using_old_options) {
Usage("Cannot use deprecated options with --request-intervals.");
}
if ((params_->using_custom_intervals) &&
(params_->using_request_rate_range || params_->using_concurrency_range)) {
Usage(
"Cannot use --concurrency-range or --request-rate-range "
"along with --request-intervals.");
}
if (params_->using_concurrency_range && params_->mpi_driver->IsMPIRun() &&
(params_->concurrency_range.end != 1 ||
params_->concurrency_range.step != 1)) {
Usage("Cannot specify --concurrency-range when in multi-model mode.");
}
if (((params_->concurrency_range.end == NO_LIMIT) ||
(params_->request_rate_range[SEARCH_RANGE::kEND] ==
static_cast<double>(NO_LIMIT))) &&
(params_->latency_threshold_ms == NO_LIMIT)) {
Usage(
"The end of the search range and the latency limit can not be both 0 "
"(or 0.0) simultaneously");
}
if (((params_->concurrency_range.end == NO_LIMIT) ||
(params_->request_rate_range[SEARCH_RANGE::kEND] ==
static_cast<double>(NO_LIMIT))) &&
(params_->search_mode == SearchMode::BINARY)) {
Usage("The end of the range can not be 0 (or 0.0) for binary search mode.");
}
if ((params_->search_mode == SearchMode::BINARY) &&
(params_->latency_threshold_ms == NO_LIMIT)) {
Usage("The --latency-threshold cannot be 0 for binary search mode.");
}
if (((params_->concurrency_range.end < params_->concurrency_range.start) ||
(params_->request_rate_range[SEARCH_RANGE::kEND] <
params_->request_rate_range[SEARCH_RANGE::kSTART])) &&
(params_->search_mode == SearchMode::BINARY)) {
Usage(
"The end of the range can not be less than start of the range for "
"binary search mode.");
}
if (params_->kind == cb::TENSORFLOW_SERVING) {
if (params_->protocol != cb::ProtocolType::GRPC) {
Usage(
"perf_analyzer supports only grpc protocol for TensorFlow Serving.");
} else if (params_->streaming) {
Usage("perf_analyzer does not support streaming for TensorFlow Serving.");
} else if (params_->async) {
Usage("perf_analyzer does not support async API for TensorFlow Serving.");
} else if (!params_->using_batch_size) {
params_->batch_size = 0;
}
} else if (params_->kind == cb::TORCHSERVE) {
if (params_->user_data.empty()) {
Usage(
"--input-data should be provided with a json file with "
"input data for torchserve.");
}
}
if (params_->kind == cb::BackendKind::TRITON_C_API) {
if (params_->triton_server_path.empty()) {
Usage(
"--triton-server-path should not be empty when using "
"service-kind=triton_c_api.");
}
if (params_->model_repository_path.empty()) {
Usage(
"--model-repository should not be empty when using "
"service-kind=triton_c_api.");
}
if (params_->async) {
Usage(
"Async mode is not supported by triton_c_api service "
"kind.");
}
params_->protocol = cb::ProtocolType::UNKNOWN;
}
if (params_->should_collect_metrics &&
params_->kind != cb::BackendKind::TRITON) {
Usage(
"Server-side metric collection is only supported with Triton client "
"backend.");
}
if (params_->metrics_url_specified &&
params_->should_collect_metrics == false) {
Usage(
"Must specify --collect-metrics when using the --metrics-url option.");
}
if (params_->metrics_interval_ms_specified &&
params_->should_collect_metrics == false) {
Usage(
"Must specify --collect-metrics when using the --metrics-interval "
"option.");
}
if (params_->should_collect_metrics && !params_->metrics_url_specified) {
// Update the default metrics URL to be associated with the input URL
// instead of localhost
//
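// For example (illustrative), "-u remotehost:8000" yields a metrics URL
// of "remotehost:8002/metrics".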
size_t colon_pos = params_->url.find(':');
if (colon_pos != std::string::npos) {
params_->metrics_url =
params_->url.substr(0, colon_pos) + ":8002/metrics";
}
}
}
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "constants.h"
#include "mpi_utils.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
enum SEARCH_RANGE { kSTART = 0, kEND = 1, kSTEP = 2 };
// Perf Analyzer command line parameters.
// PAParams are used to initialize PerfAnalyzer and track configuration
//
struct PerfAnalyzerParameters {
bool verbose = false;
bool extra_verbose = false;
bool streaming = false;
size_t max_threads = 4;
bool max_threads_specified = false;
size_t sequence_length = 20; // average length of a sequence
bool sequence_length_specified = false;
double sequence_length_variation = 20.0;
int32_t percentile = -1;
std::vector<std::string> user_data;
std::unordered_map<std::string, std::vector<int64_t>> input_shapes;
std::vector<cb::ModelIdentifier> bls_composing_models;
uint64_t measurement_window_ms = 5000;
bool using_concurrency_range = false;
Range<uint64_t> concurrency_range{1, 1, 1};
uint64_t latency_threshold_ms = NO_LIMIT;
double stability_threshold = 0.1;
size_t max_trials = 10;
bool zero_input = false;
size_t string_length = 128;
std::string string_data;
bool async = false;
bool forced_sync = false;
bool using_request_rate_range = false;
double request_rate_range[3] = {1.0, 1.0, 1.0};
uint32_t num_of_sequences = 4;
bool serial_sequences = false;
SearchMode search_mode = SearchMode::LINEAR;
Distribution request_distribution = Distribution::CONSTANT;
bool using_custom_intervals = false;
std::string request_intervals_file{""};
SharedMemoryType shared_memory_type = NO_SHARED_MEMORY;
size_t output_shm_size = 100 * 1024;
clientbackend::BackendKind kind = clientbackend::BackendKind::TRITON;
std::string model_signature_name{"serving_default"};
bool using_grpc_compression = false;
clientbackend::GrpcCompressionAlgorithm compression_algorithm =
clientbackend::GrpcCompressionAlgorithm::COMPRESS_NONE;
MeasurementMode measurement_mode = MeasurementMode::TIME_WINDOWS;
uint64_t measurement_request_count = 50;
std::string triton_server_path = "/opt/tritonserver";
std::string model_repository_path;
uint64_t start_sequence_id = 1;
uint64_t sequence_id_range = UINT32_MAX;
clientbackend::SslOptionsBase ssl_options; // gRPC and HTTP SSL options
// Verbose csv option for including additional information
bool verbose_csv = false;
// Enable MPI option for using MPI functionality with multi-model mode.
bool enable_mpi = false;
std::map<std::string, std::vector<std::string>> trace_options;
bool using_old_options = false;
bool dynamic_concurrency_mode = false;
bool url_specified = false;
std::string url{"localhost:8000"};
std::string model_name;
std::string model_version;
uint64_t batch_size = 1;
bool using_batch_size = false;
int32_t concurrent_request_count = 1;
clientbackend::ProtocolType protocol = clientbackend::ProtocolType::HTTP;
std::shared_ptr<clientbackend::Headers> http_headers{
new clientbackend::Headers()};
size_t max_concurrency = 0;
std::string filename{""};
std::shared_ptr<MPIDriver> mpi_driver;
std::string memory_type{"system"}; // currently not used, to be removed
// Enable collection of server-side metrics from inference server.
bool should_collect_metrics{false};
// The URL to query for server-side inference server metrics.
std::string metrics_url{"localhost:8002/metrics"};
bool metrics_url_specified{false};
// How often, within each measurement window, to query for server-side
// inference server metrics.
uint64_t metrics_interval_ms{1000};
bool metrics_interval_ms_specified{false};
// Return true if targeting concurrency
//
bool targeting_concurrency() const
{
return (
using_concurrency_range || using_old_options ||
!(using_request_rate_range || using_custom_intervals));
}
// Sets the threshold for PA client overhead.
// Overhead is defined as the percentage of time when PA is doing work and
// requests are not outstanding to the triton server. If the overhead
// percentage exceeds the threshold, a warning is displayed.
//
double overhead_pct_threshold{50.0};
// Triton inference request input tensor format.
cb::TensorFormat input_tensor_format{cb::TensorFormat::BINARY};
// Triton inference response output tensor format.
cb::TensorFormat output_tensor_format{cb::TensorFormat::BINARY};
// The profile export file path.
std::string profile_export_file{""};
};
using PAParamsPtr = std::shared_ptr<PerfAnalyzerParameters>;
class CLParser {
public:
CLParser() : params_(new PerfAnalyzerParameters{}) {}
// Parse command line arguments into a parameters struct
//
PAParamsPtr Parse(int argc, char** argv);
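// Illustrative usage (sketch): in main(),
//   PAParamsPtr params = CLParser().Parse(argc, argv);
// parses the command line and returns the populated parameter struct.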
private:
char** argv_;
int argc_;
PAParamsPtr params_;
std::string FormatMessage(std::string str, int offset) const;
virtual void Usage(const std::string& msg = std::string());
void PrintVersion();
void ParseCommandLine(int argc, char** argv);
void VerifyOptions();
};
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "base_queue_ctx_id_tracker.h"
namespace triton { namespace perfanalyzer {
// Context ID Tracker that always returns context 0, but ensures that only X
// requests are outstanding at a time
//
class ConcurrencyCtxIdTracker : public BaseQueueCtxIdTracker {
public:
ConcurrencyCtxIdTracker() = default;
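// Refill the free context id queue with 'count' entries, all for context 0,
// so that at most 'count' requests can be outstanding at once.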
void Reset(size_t count) override
{
Clear();
for (size_t i = 0; i < count; ++i) {
free_ctx_ids_.push(0);
}
}
};
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "concurrency_manager.h"
#include <queue>
namespace triton { namespace perfanalyzer {
ConcurrencyManager::~ConcurrencyManager()
{
// The destruction of the derived class should wait for all the request
// generator threads to finish
StopWorkerThreads();
}
cb::Error
ConcurrencyManager::Create(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager)
{
std::unique_ptr<ConcurrencyManager> local_manager(new ConcurrencyManager(
async, streaming, batch_size, max_threads, max_concurrency,
shared_memory_type, output_shm_size, parser, factory));
*manager = std::move(local_manager);
return cb::Error::Success;
}
ConcurrencyManager::ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: LoadManager(
async, streaming, batch_size, max_threads, shared_memory_type,
output_shm_size, parser, factory),
execute_(true), max_concurrency_(max_concurrency)
{
threads_config_.reserve(max_threads);
}
void
ConcurrencyManager::InitManagerFinalize()
{
if (on_sequence_model_) {
sequence_manager_->InitSequenceStatuses(max_concurrency_);
}
}
cb::Error
ConcurrencyManager::ChangeConcurrencyLevel(
const size_t concurrent_request_count)
{
PauseSequenceWorkers();
ReconfigThreads(concurrent_request_count);
ResumeSequenceWorkers();
std::cout << "Request concurrency: " << concurrent_request_count << std::endl;
return cb::Error::Success;
}
void
ConcurrencyManager::PauseSequenceWorkers()
{
if (on_sequence_model_) {
execute_ = false;
// Wait to see all threads are paused.
for (auto& thread_config : threads_config_) {
while (!thread_config->is_paused_) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
}
}
void
ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
{
// Always prefer to create new threads if the maximum limit has not been met
//
// While operating in synchronous mode, each context can send only one
// request at a time, hence the number of worker threads should be equal to
// the requested concurrency levels.
//
while ((concurrent_request_count > threads_.size()) &&
(threads_.size() < max_threads_)) {
// Launch new thread for inferencing
threads_stat_.emplace_back(new ThreadStat());
threads_config_.emplace_back(
new ConcurrencyWorker::ThreadConfig(threads_config_.size()));
workers_.push_back(
MakeWorker(threads_stat_.back(), threads_config_.back()));
threads_.emplace_back(&IWorker::Infer, workers_.back());
}
{
// Make sure all threads are reconfigured before they are woken up
std::lock_guard<std::mutex> lock(wake_mutex_);
// Compute the new concurrency level for each thread (take floor)
// and spread the remaining value
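// (e.g. 10 concurrent requests over 4 threads -> concurrencies 3, 3, 2, 2)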
size_t avg_concurrency = concurrent_request_count / threads_.size();
size_t threads_add_one = concurrent_request_count % threads_.size();
size_t seq_stat_index_offset = 0;
active_threads_ = 0;
for (size_t i = 0; i < threads_stat_.size(); i++) {
size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0);
threads_config_[i]->concurrency_ = concurrency;
threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset;
seq_stat_index_offset += concurrency;
if (concurrency) {
active_threads_++;
}
}
// TODO REFACTOR TMA-1043 the memory manager should have API to set
// num_active_threads in constructor, as well as overwrite it here
}
}
void
ConcurrencyManager::ResumeSequenceWorkers()
{
if (on_sequence_model_) {
execute_ = true;
}
// Make sure all threads will check their updated concurrency level
wake_signal_.notify_all();
}
std::shared_ptr<IWorker>
ConcurrencyManager::MakeWorker(
std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ConcurrencyWorker::ThreadConfig> thread_config)
{
uint32_t id = workers_.size();
return std::make_shared<ConcurrencyWorker>(
id, thread_stat, thread_config, parser_, data_loader_, factory_,
on_sequence_model_, async_, max_concurrency_, using_json_data_,
streaming_, batch_size_, wake_signal_, wake_mutex_, active_threads_,
execute_, infer_data_manager_, sequence_manager_);
}
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include "concurrency_worker.h"
#include "load_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class TestConcurrencyManager;
#endif
//==============================================================================
/// ConcurrencyManager is a helper class that sends inference requests to the
/// inference server consistently, based on the specified settings, so that
/// perf_analyzer can measure performance under different concurrency levels.
///
/// An instance of the concurrency manager is created at the start of
/// perf_analyzer and is used to simulate different load levels with respect
/// to the number of concurrent infer requests and to collect per-request
/// statistics.
///
/// Detail:
/// Concurrency Manager will maintain the number of concurrent requests by
/// spawning worker threads that keep sending randomly generated requests to the
/// server. The worker threads will record the start time and end
/// time of each request into a shared vector.
///
class ConcurrencyManager : public LoadManager {
public:
~ConcurrencyManager();
  /// Create a concurrency manager that is responsible for maintaining the
  /// specified load on the inference server.
/// \param async Whether to use asynchronous or synchronous API for infer
/// request.
/// \param streaming Whether to use gRPC streaming API for infer request
/// \param batch_size The batch size used for each request.
/// \param max_threads The maximum number of working threads to be spawned.
/// \param max_concurrency The maximum concurrency which will be requested.
/// \param shared_memory_type The type of shared memory to use for inputs.
/// \param output_shm_size The size in bytes of the shared memory to
/// allocate for the output.
/// \param parser The ModelParser object to get the model details.
/// \param factory The ClientBackendFactory object used to create
/// client to the server.
/// \param manager Returns a new ConcurrencyManager object.
/// \return cb::Error object indicating success or failure.
static cb::Error Create(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager);
/// Adjusts the number of concurrent requests to be the same as
/// 'concurrent_request_count' (by creating or pausing threads)
  /// \param concurrent_request_count The number of concurrent requests.
/// \return cb::Error object indicating success or failure.
cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count);
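  // Example usage (a minimal sketch; the parser, factory, and
  // shared_memory_type values are assumed to be set up elsewhere, and the
  // numeric arguments are hypothetical):
  //
  //   std::unique_ptr<LoadManager> manager;
  //   ConcurrencyManager::Create(
  //       /*async=*/true, /*streaming=*/false, /*batch_size=*/1,
  //       /*max_threads=*/16, /*max_concurrency=*/8, shared_memory_type,
  //       /*output_shm_size=*/0, parser, factory, &manager);
  //   // Request a load of 4 concurrent requests; later calls can raise or
  //   // lower the level by creating or pausing worker threads.
  //   static_cast<ConcurrencyManager*>(manager.get())
  //       ->ChangeConcurrencyLevel(4);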
protected:
// Makes a new worker
virtual std::shared_ptr<IWorker> MakeWorker(
std::shared_ptr<ThreadStat>,
std::shared_ptr<ConcurrencyWorker::ThreadConfig>);
private:
ConcurrencyManager(
const bool async, const bool streaming, const int32_t batch_size,
const size_t max_threads, const size_t max_concurrency,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory);
void InitManagerFinalize() override;
// Pause all worker threads that are working on sequences
//
void PauseSequenceWorkers();
// Create new threads (if necessary), and then reconfigure all worker threads
// to handle the new concurrent request count
//
void ReconfigThreads(size_t concurrent_request_count);
// Restart all worker threads that were working on sequences
//
void ResumeSequenceWorkers();
// The number of worker threads with non-zero concurrencies
size_t active_threads_;
bool execute_;
size_t max_concurrency_;
std::vector<std::shared_ptr<ConcurrencyWorker::ThreadConfig>> threads_config_;
#ifndef DOCTEST_CONFIG_DISABLE
friend TestConcurrencyManager;
public:
ConcurrencyManager() = default;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "concurrency_worker.h"
#include <algorithm>
#include "client_backend/client_backend.h"
#include "perf_utils.h"
namespace triton { namespace perfanalyzer {
// Function for worker threads.
// If the model is a non-sequence model, each worker uses only one context
// to maintain the concurrency assigned to it.
// If the model is a sequence model, each worker has to use multiple contexts
// to maintain the (sequence) concurrency assigned to it.
void
ConcurrencyWorker::Infer()
{
CreateCtxIdTracker();
ReserveContexts();
  // Run inferencing until the exit signal is received, in order to maintain
  // the server load.
do {
HandleExecuteOff();
if (HandleNoConcurrency()) {
return;
}
CreateContextsAsNecessary();
if (HandleExitConditions()) {
return;
}
SendInferRequests();
if (HandleExitConditions()) {
return;
}
WaitForResponses();
if (HandleExitConditions()) {
return;
}
} while (true);
}
void
ConcurrencyWorker::CreateCtxIdTracker()
{
bool is_concurrency = true;
bool serial_sequences = false;
ctx_id_tracker_ = CtxIdTrackerFactory::CreateTracker(
is_concurrency, on_sequence_model_, serial_sequences);
}
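// Note: because is_concurrency is fixed to true here, the factory returns a
// FifoCtxIdTracker for sequence models and a ConcurrencyCtxIdTracker
// otherwise (see CtxIdTrackerFactory::CreateTracker).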
void
ConcurrencyWorker::ReserveContexts()
{
  // Reserve the vectors in the case of sequence models. In non-sequence or
  // synchronous mode only one context will be opened, hence there is no need
  // to reserve.
if (on_sequence_model_ && async_) {
thread_stat_->contexts_stat_.reserve(max_concurrency_);
ctxs_.reserve(max_concurrency_);
}
}
void
ConcurrencyWorker::HandleExecuteOff()
{
if (on_sequence_model_) {
if (!execute_) {
// Ensures the clean exit of the sequences
CompleteOngoingSequences();
WaitForOngoingRequests();
      // Reset Ctx IDs because CompleteOngoingSequences()
      // has destructive side effects
ResetFreeCtxIds();
// Wait if no request should be sent and it is not exiting
thread_config_->is_paused_ = true;
std::unique_lock<std::mutex> lock(wake_mutex_);
wake_signal_.wait(lock, [this]() { return early_exit || execute_; });
// TODO REFACTOR TMA-1043 - memory manager should be handling this instead
// of here
for (auto ctx : ctxs_) {
ctx->SetNumActiveThreads(active_threads_);
}
}
}
thread_config_->is_paused_ = false;
}
bool
ConcurrencyWorker::HandleNoConcurrency()
{
  // Only interact with the synchronization mechanism if the worker should wait
if (thread_config_->concurrency_ == 0) {
// Wait if no request should be sent and it is not exiting
std::unique_lock<std::mutex> lock(wake_mutex_);
wake_signal_.wait(lock, [this]() {
return early_exit || (thread_config_->concurrency_ > 0);
});
// Stop executing if concurrency is 0 and early exit is requested
if (early_exit && thread_config_->concurrency_ == 0) {
return true;
}
}
return false;
}
void
ConcurrencyWorker::CreateContextsAsNecessary()
{
  // If the model is a non-sequence model, use one InferContext to
  // maintain the concurrency for this thread.
size_t active_ctx_cnt = on_sequence_model_ ? thread_config_->concurrency_ : 1;
if (active_ctx_cnt > ctxs_.size()) {
while (active_ctx_cnt > ctxs_.size()) {
CreateContext();
}
ResetFreeCtxIds();
}
// TODO REFACTOR TMA-1043 -- this shouldn't be handled here
for (auto ctx : ctxs_) {
ctx->SetNumActiveThreads(active_threads_);
}
}
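// Illustration: for a sequence model whose thread is configured with
// concurrency_ == 4, the loop above grows ctxs_ from 0 to 4 contexts and then
// resets the free-context-ID pool; for a non-sequence model a single context
// is created regardless of the configured concurrency.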
void
ConcurrencyWorker::SendInferRequests()
{
while (ctx_id_tracker_->IsAvailable() && execute_ && !ShouldExit()) {
uint32_t ctx_id = GetCtxId();
SendInferRequest(ctx_id);
RestoreFreeCtxId(ctx_id);
}
}
void
ConcurrencyWorker::WaitForResponses()
{
if (async_) {
{
// If async, then wait for signal from callback.
std::unique_lock<std::mutex> lk(cb_mtx_);
thread_stat_->idle_timer.Start();
cb_cv_.wait(lk, [this] {
if (notified_) {
notified_ = false;
return true;
}
return false;
});
thread_stat_->idle_timer.Stop();
}
}
}
void
ConcurrencyWorker::ResetFreeCtxIds()
{
std::lock_guard<std::mutex> lock(cb_mtx_);
ctx_id_tracker_->Reset(thread_config_->concurrency_);
}
uint32_t
ConcurrencyWorker::GetSeqStatIndex(uint32_t ctx_id)
{
return (thread_config_->seq_stat_index_offset_ + ctx_id);
}
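// Example: a worker whose ThreadConfig carries seq_stat_index_offset_ == 3
// maps its local ctx_id values 0..2 to global sequence-stat indices 3..5,
// keeping workers from overlapping in the shared sequence statistics.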
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include "load_worker.h"
#include "sequence_manager.h"
namespace triton { namespace perfanalyzer {
#ifndef DOCTEST_CONFIG_DISABLE
class NaggyMockConcurrencyWorker;
#endif
/// Worker thread for the ConcurrencyManager
///
/// The worker maintains concurrency in different ways:
/// For sequence models, multiple contexts must be created for multiple
/// concurrent sequences.
///
/// For non-sequence models, one context can send out multiple requests
/// at the same time. Thus a single context is used, since every infer context
/// implicitly creates a worker thread.
///
class ConcurrencyWorker : public LoadWorker {
public:
struct ThreadConfig {
ThreadConfig(size_t thread_id)
: thread_id_(thread_id), concurrency_(0), seq_stat_index_offset_(0),
is_paused_(false)
{
}
// ID of corresponding worker thread
size_t thread_id_;
// The concurrency level that the worker should produce
size_t concurrency_;
// The starting sequence stat index for this worker
size_t seq_stat_index_offset_;
// Whether or not the thread is issuing new inference requests
bool is_paused_;
};
ConcurrencyWorker(
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
std::shared_ptr<ThreadConfig> thread_config,
const std::shared_ptr<ModelParser> parser,
std::shared_ptr<DataLoader> data_loader,
const std::shared_ptr<cb::ClientBackendFactory> factory,
const bool on_sequence_model, const bool async,
const size_t max_concurrency, const bool using_json_data,
const bool streaming, const int32_t batch_size,
std::condition_variable& wake_signal, std::mutex& wake_mutex,
size_t& active_threads, bool& execute,
const std::shared_ptr<IInferDataManager>& infer_data_manager,
std::shared_ptr<SequenceManager> sequence_manager)
: LoadWorker(
id, thread_stat, parser, data_loader, factory, on_sequence_model,
async, streaming, batch_size, using_json_data, wake_signal,
wake_mutex, execute, infer_data_manager, sequence_manager),
thread_config_(thread_config), max_concurrency_(max_concurrency),
active_threads_(active_threads)
{
}
void Infer() override;
private:
const size_t max_concurrency_;
// TODO REFACTOR TMA-1020 can we decouple this thread from the total count of
// threads?
size_t& active_threads_;
std::shared_ptr<ThreadConfig> thread_config_;
void CreateCtxIdTracker();
// Reserve vector size for contexts
void ReserveContexts();
// Handle the case where execute_ is false
void HandleExecuteOff();
// Handle the case where this thread is configured to do nothing
// Returns true if an exit condition was met
bool HandleNoConcurrency();
// Create and populate contexts if needed
void CreateContextsAsNecessary();
// Send out the desired concurrency of requests
void SendInferRequests();
void WaitForResponses();
void ResetFreeCtxIds();
uint32_t GetSeqStatIndex(uint32_t ctx_id) override;
void CreateContextFinalize(std::shared_ptr<InferContext> ctx) override
{
ctx->RegisterAsyncCallbackFinalize(std::bind(
&ConcurrencyWorker::AsyncCallbackFinalize, this,
std::placeholders::_1));
}
#ifndef DOCTEST_CONFIG_DISABLE
friend NaggyMockConcurrencyWorker;
#endif
};
}} // namespace triton::perfanalyzer
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <cstdint>
#include <string>
#define STRINGIFY_(x) #x
#define STRINGIFY(x) STRINGIFY_(x)
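// The two-level macro is the standard trick for stringizing the *value* of a
// macro rather than its name. For example, if the build passes
// -DGIT_SHA=0a1b2c3 (the value shown here is hypothetical), STRINGIFY(GIT_SHA)
// first expands GIT_SHA and yields "0a1b2c3", whereas the single-level
// STRINGIFY_(GIT_SHA) would yield "GIT_SHA".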
namespace triton { namespace perfanalyzer {
const std::string SHA{STRINGIFY(GIT_SHA)};
const std::string VERSION{STRINGIFY(PERF_ANALYZER_VERSION)};
constexpr static const uint32_t SUCCESS = 0;
constexpr static const uint32_t STABILITY_ERROR = 2;
constexpr static const uint32_t OPTION_ERROR = 3;
constexpr static const uint32_t GENERIC_ERROR = 99;
const double DELAY_PCT_THRESHOLD{1.0};
/// Different measurement modes possible.
enum MeasurementMode { TIME_WINDOWS = 0, COUNT_WINDOWS = 1 };
}} // namespace triton::perfanalyzer
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once
#include <memory>
#include "concurrency_ctx_id_tracker.h"
#include "fifo_ctx_id_tracker.h"
#include "rand_ctx_id_tracker.h"
namespace triton { namespace perfanalyzer {
// Factory for creating the Context ID tracker appropriate to the load
// generation mode (concurrency vs. request rate, sequence vs. non-sequence)
//
class CtxIdTrackerFactory {
public:
CtxIdTrackerFactory() = delete;
/// Creates and returns a Context Id Tracker
///
/// \param is_concurrency True if targeting Concurrency
/// \param is_sequence_model True if the model is a sequence model
/// \param serial_sequences True if in serial sequence mode
///
static std::shared_ptr<ICtxIdTracker> CreateTracker(
bool is_concurrency, bool is_sequence_model, bool serial_sequences)
{
if (is_concurrency) {
if (is_sequence_model) {
return std::make_shared<FifoCtxIdTracker>();
} else {
return std::make_shared<ConcurrencyCtxIdTracker>();
}
} else {
if (is_sequence_model && serial_sequences) {
return std::make_shared<FifoCtxIdTracker>();
} else {
return std::make_shared<RandCtxIdTracker>();
}
}
}
};
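// Selection summary, derived from CreateTracker above:
//
//   is_concurrency | is_sequence_model | serial_sequences | tracker returned
//   ---------------+-------------------+------------------+-------------------------
//   true           | true              | (any)            | FifoCtxIdTracker
//   true           | false             | (any)            | ConcurrencyCtxIdTracker
//   false          | true              | true             | FifoCtxIdTracker
//   false          | (all other cases) |                  | RandCtxIdTracker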
}} // namespace triton::perfanalyzer
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "custom_load_manager.h"
#include <fstream>
#include "constants.h"
namespace triton { namespace perfanalyzer {
cb::Error
CustomLoadManager::Create(
const bool async, const bool streaming,
const uint64_t measurement_window_ms, const size_t max_trials,
const std::string& request_intervals_file, const int32_t batch_size,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory,
std::unique_ptr<LoadManager>* manager)
{
std::unique_ptr<CustomLoadManager> local_manager(new CustomLoadManager(
async, streaming, request_intervals_file, batch_size,
measurement_window_ms, max_trials, max_threads, num_of_sequences,
shared_memory_type, output_shm_size, serial_sequences, parser, factory));
*manager = std::move(local_manager);
return cb::Error::Success;
}
CustomLoadManager::CustomLoadManager(
const bool async, const bool streaming,
const std::string& request_intervals_file, int32_t batch_size,
const uint64_t measurement_window_ms, const size_t max_trials,
const size_t max_threads, const uint32_t num_of_sequences,
const SharedMemoryType shared_memory_type, const size_t output_shm_size,
const bool serial_sequences, const std::shared_ptr<ModelParser>& parser,
const std::shared_ptr<cb::ClientBackendFactory>& factory)
: RequestRateManager(
async, streaming, Distribution::CUSTOM, batch_size,
measurement_window_ms, max_trials, max_threads, num_of_sequences,
shared_memory_type, output_shm_size, serial_sequences, parser,
factory),
request_intervals_file_(request_intervals_file)
{
}
cb::Error
CustomLoadManager::InitCustomIntervals()
{
PauseWorkers();
ConfigureThreads();
auto status = GenerateSchedule();
ResumeWorkers();
return status;
}
cb::Error
CustomLoadManager::GenerateSchedule()
{
if (request_intervals_file_.empty()) {
return cb::Error::Success;
}
RETURN_IF_ERROR(
ReadTimeIntervalsFile(request_intervals_file_, &custom_intervals_));
auto worker_schedules = CreateWorkerSchedules();
GiveSchedulesToWorkers(worker_schedules);
return cb::Error::Success;
}
std::vector<RateSchedulePtr_t>
CustomLoadManager::CreateWorkerSchedules()
{
std::vector<RateSchedulePtr_t> worker_schedules =
CreateEmptyWorkerSchedules();
std::vector<size_t> thread_ids{CalculateThreadIds()};
size_t thread_id_index = 0;
size_t worker_index = 0;
size_t intervals_index = 0;
std::chrono::nanoseconds next_timestamp(0);
bool started = false;
  // Keep filling the schedule until both the thread_ids list (whose size can
  // differ if sequences are enabled) and the intervals list have wrapped back
  // to their starts. This effectively walks the least common multiple of the
  // two sizes and ensures that the schedule is complete and can be repeated
  // indefinitely
  //
while (!started || thread_id_index != 0 || intervals_index != 0) {
started = true;
next_timestamp += custom_intervals_[intervals_index];
worker_index = thread_ids[thread_id_index];
worker_schedules[worker_index]->intervals.emplace_back(next_timestamp);
thread_id_index = (thread_id_index + 1) % thread_ids.size();
intervals_index = (intervals_index + 1) % custom_intervals_.size();
}
SetScheduleDurations(worker_schedules);
return worker_schedules;
}
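// Worked example (hypothetical sizes): with thread_ids = {0, 1} and three
// custom intervals {10ms, 20ms, 30ms}, the loop runs LCM(2, 3) = 6 times and
// produces cumulative timestamps 10, 30, 60, 70, 90 and 120 ms, handed out
// alternately:
//
//   worker 0: 10ms, 60ms, 90ms
//   worker 1: 30ms, 70ms, 120ms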
cb::Error
CustomLoadManager::GetCustomRequestRate(double* request_rate)
{
if (custom_intervals_.empty()) {
return cb::Error("The custom intervals vector is empty", pa::GENERIC_ERROR);
}
uint64_t total_time_ns = 0;
for (auto interval : custom_intervals_) {
total_time_ns += interval.count();
}
*request_rate =
(custom_intervals_.size() * NANOS_PER_SECOND) / (total_time_ns);
return cb::Error::Success;
}
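// Example: four custom intervals of 250ms each give total_time_ns = 1e9, so
// the derived rate is (4 * NANOS_PER_SECOND) / 1e9 = 4 requests per second.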
cb::Error
CustomLoadManager::ReadTimeIntervalsFile(
const std::string& path, NanoIntervals* contents)
{
std::ifstream in(path);
if (!in) {
return cb::Error("failed to open file '" + path + "'", pa::GENERIC_ERROR);
}
std::string current_string;
while (std::getline(in, current_string)) {
    std::chrono::nanoseconds current_time_interval_ns(
        std::stol(current_string) * 1000);
    contents->push_back(current_time_interval_ns);
}
in.close();
if (contents->size() == 0) {
return cb::Error("file '" + path + "' is empty", pa::GENERIC_ERROR);
}
return cb::Error::Success;
}
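// Note: each line of the intervals file is parsed as an integer and multiplied
// by 1000 before being stored as nanoseconds, i.e. the file is expected to
// contain one interval per line expressed in microseconds. For example, lines
// "1000" and "2000" yield intervals of 1ms and 2ms respectively.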
}} // namespace triton::perfanalyzer