Commit c68e1835 authored by lijian6

Initial commit
---
BasedOnStyle: Google
IndentWidth: 2
ColumnLimit: 80
ContinuationIndentWidth: 4
UseTab: Never
MaxEmptyLinesToKeep: 2
SortIncludes: true
CompactNamespaces: true
ReflowComments: true
DerivePointerAlignment: false
PointerAlignment: Left
AllowShortIfStatementsOnASingleLine: false
AllowShortBlocksOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AlwaysBreakAfterReturnType: TopLevelDefinitions
AlignAfterOpenBracket: AlwaysBreak
BreakBeforeBraces: Custom
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: true
  AfterNamespace: false
  AfterStruct: false
  AfterUnion: false
  BeforeCatch: true
BinPackArguments: true
BinPackParameters: true
ConstructorInitializerAllOnOneLineOrOnePerLine: false
IndentCaseLabels: true
#VSCode
/.vscode
src/.vscode
src/c++/.vscode
src/python/.vscode
#C++
/build
*.so
src/c++/perf_analyzer/builddir/
src/c++/perf_analyzer/.vscode/
#Python
__pycache__/
*.pyc
#Other
node_modules
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
exclude: ^src/grpc_generated
repos:
  - repo: https://github.com/timothycrosley/isort
    rev: 5.12.0
    hooks:
      - id: isort
        additional_dependencies: [toml]
  - repo: https://github.com/psf/black
    rev: 23.1.0
    hooks:
      - id: black
        types_or: [python, cython]
  - repo: https://github.com/PyCQA/flake8
    rev: 5.0.4
    hooks:
      - id: flake8
        args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore=E203,E501]
        types_or: [python, cython]
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v16.0.5
    hooks:
      - id: clang-format
        types_or: [c, c++, cuda, proto, textproto, java]
        args: ["-fallback-style=none", "-style=file", "-i"]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.4
    hooks:
      - id: codespell
        additional_dependencies: [tomli]
        args: ["--toml", "pyproject.toml"]
        exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$)
  # More details about these pre-commit hooks here:
  # https://pre-commit.com/hooks.html
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-case-conflict
      - id: check-executables-have-shebangs
      - id: check-merge-conflict
      - id: check-json
      - id: check-toml
      - id: check-yaml
      - id: check-shebang-scripts-are-executable
      - id: end-of-file-fixer
        types_or: [c, c++, cuda, proto, textproto, java, python]
      - id: mixed-line-ending
      - id: requirements-txt-fixer
      - id: trailing-whitespace
# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.17)
project(tritonclient LANGUAGES C CXX)
#
# Options
#
set(TRITON_VERSION "0.0.0" CACHE STRING "Version for the clients")
set(PERF_ANALYZER_VERSION ${TRITON_VERSION} CACHE STRING "Build Version for Perf Analyzer")
option(TRITON_ENABLE_CC_HTTP "Build C++ HTTP client libraries" OFF)
option(TRITON_ENABLE_CC_GRPC "Build C++ GRPC client libraries" OFF)
option(TRITON_ENABLE_PYTHON_HTTP "Enable Python HTTP client libraries" OFF)
option(TRITON_ENABLE_PYTHON_GRPC "Enable Python GRPC client libraries" OFF)
option(TRITON_ENABLE_JAVA_HTTP "Enable JAVA HTTP client libraries" OFF)
option(TRITON_ENABLE_PERF_ANALYZER "Enable Performance Analyzer" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_C_API "Enable Performance Analyzer C API" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_TFS "Enable TensorFlow Serving support for Performance Analyzer" OFF)
option(TRITON_ENABLE_PERF_ANALYZER_TS "Enable TorchServe support for Performance Analyzer" OFF)
option(TRITON_ENABLE_EXAMPLES "Include examples in build" OFF)
option(TRITON_ENABLE_TESTS "Include tests in build" OFF)
option(TRITON_ENABLE_GPU "Enable GPU support in libraries" OFF)
option(TRITON_ENABLE_ZLIB "Include ZLIB library in build" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_THIRD_PARTY_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/third_party repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
if(NOT TRITON_ENABLE_PYTHON_GRPC)
set(TRITON_COMMON_ENABLE_PROTOBUF_PYTHON OFF)
endif()
#
# Dependencies
#
include(FetchContent)
FetchContent_Declare(
repo-third-party
GIT_REPOSITORY https://github.com/triton-inference-server/third_party.git
GIT_TAG ${TRITON_THIRD_PARTY_REPO_TAG}
GIT_SHALLOW ON
)
set(TRITON_THIRD_PARTY_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/third-party)
FetchContent_MakeAvailable(repo-third-party)
# Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead
# of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos
set (LIB_DIR "lib")
# /etc/os-release does not exist on Windows
if(EXISTS "/etc/os-release")
file(STRINGS /etc/os-release DISTRO REGEX "^NAME=")
string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}")
message(STATUS "Distro Name: ${DISTRO}")
if(DISTRO MATCHES "CentOS.*")
set (LIB_DIR "lib64")
endif()
endif()
# Need to use ExternalProject for our builds so that we can get the
# correct dependencies between our components and the ExternalProject
# dependencies (found in the third_party repo)
include(ExternalProject)
if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(TRITON_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/cc-clients/install)
else()
set(TRITON_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
endif()
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "")
if (OPENSSL_ROOT_DIR)
set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}")
endif()
set(_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE "")
if (CMAKE_TOOLCHAIN_FILE)
set(_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE "-DCMAKE_TOOLCHAIN_FILE:PATH=${CMAKE_TOOLCHAIN_FILE}")
endif()
set(_CMAKE_ARGS_VCPKG_TARGET_TRIPLET "")
if (VCPKG_TARGET_TRIPLET)
set(_CMAKE_ARGS_VCPKG_TARGET_TRIPLET "-DVCPKG_TARGET_TRIPLET:STRING=${VCPKG_TARGET_TRIPLET}")
endif()
# Location where protobuf-config.cmake will be installed varies by
# platform
if (WIN32)
set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/protobuf/cmake")
else()
set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/protobuf/${LIB_DIR}/cmake/protobuf")
endif()
if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER OR TRITON_ENABLE_PERF_ANALYZER_C_API)
set(_cc_client_depends "")
if(${TRITON_ENABLE_CC_HTTP})
set(_cc_client_depends ${_cc_client_depends} curl)
endif() # TRITON_ENABLE_CC_HTTP
if(${TRITON_ENABLE_CC_GRPC} OR ${TRITON_ENABLE_PERF_ANALYZER})
set(_cc_client_depends ${_cc_client_depends} grpc protobuf)
endif() # TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_C_API})
message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_C_API=ON requires TRITON_ENABLE_PERF_ANALYZER=ON")
endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_C_API
if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_TFS})
message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_TFS=ON requires TRITON_ENABLE_PERF_ANALYZER=ON")
endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_TFS
if(NOT ${TRITON_ENABLE_PERF_ANALYZER} AND ${TRITON_ENABLE_PERF_ANALYZER_TS})
message(FATAL_ERROR "TRITON_ENABLE_PERF_ANALYZER_TS=ON requires TRITON_ENABLE_PERF_ANALYZER=ON")
endif() # NOT TRITON_ENABLE_PERF_ANALYZER AND TRITON_ENABLE_PERF_ANALYZER_TS
ExternalProject_Add(cc-clients
PREFIX cc-clients
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src/c++"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/cc-clients"
CMAKE_CACHE_ARGS
${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE}
${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET}
-DCURL_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/curl/${LIB_DIR}/cmake/CURL
-DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
-DgRPC_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/grpc/lib/cmake/grpc
-Dabsl_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/absl/${LIB_DIR}/cmake/absl
-Dc-ares_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/c-ares/${LIB_DIR}/cmake/c-ares
-DGTEST_ROOT:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/googletest
-DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG}
-DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG}
-DPERF_ANALYZER_VERSION:STRING=${PERF_ANALYZER_VERSION}
-DTRITON_ENABLE_CC_HTTP:BOOL=${TRITON_ENABLE_CC_HTTP}
-DTRITON_ENABLE_CC_GRPC:BOOL=${TRITON_ENABLE_CC_GRPC}
-DTRITON_ENABLE_PERF_ANALYZER:BOOL=${TRITON_ENABLE_PERF_ANALYZER}
-DTRITON_ENABLE_PERF_ANALYZER_C_API:BOOL=${TRITON_ENABLE_PERF_ANALYZER_C_API}
-DTRITON_ENABLE_PERF_ANALYZER_TFS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TFS}
-DTRITON_ENABLE_PERF_ANALYZER_TS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TS}
-DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES}
-DTRITON_ENABLE_TESTS:BOOL=${TRITON_ENABLE_TESTS}
-DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU}
-DTRITON_ENABLE_ZLIB:BOOL=${TRITON_ENABLE_ZLIB}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX:PATH=${TRITON_INSTALL_PREFIX}
DEPENDS ${_cc_client_depends}
)
endif() # TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
if(TRITON_ENABLE_PYTHON_HTTP OR TRITON_ENABLE_PYTHON_GRPC)
set(_py_client_depends "")
if(${TRITON_ENABLE_PYTHON_GRPC})
set(_py_client_depends ${_py_client_depends} grpc protobuf)
endif() # TRITON_ENABLE_PYTHON_GRPC
if(${TRITON_ENABLE_PERF_ANALYZER})
set(_py_client_depends ${_py_client_depends} cc-clients)
endif() # TRITON_ENABLE_PERF_ANALYZER
ExternalProject_Add(python-clients
PREFIX python-clients
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src/python"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/python-clients"
CMAKE_CACHE_ARGS
${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE}
${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET}
-DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
-DgRPC_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/grpc/lib/cmake/grpc
-Dabsl_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/absl/${LIB_DIR}/cmake/absl
-Dc-ares_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/c-ares/${LIB_DIR}/cmake/c-ares
-DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG}
-DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG}
-DTRITON_VERSION:STRING=${TRITON_VERSION}
-DTRITON_ENABLE_PYTHON_HTTP:BOOL=${TRITON_ENABLE_PYTHON_HTTP}
-DTRITON_ENABLE_PYTHON_GRPC:BOOL=${TRITON_ENABLE_PYTHON_GRPC}
-DTRITON_ENABLE_PERF_ANALYZER:BOOL=${TRITON_ENABLE_PERF_ANALYZER}
-DTRITON_ENABLE_PERF_ANALYZER_C_API:BOOL=${TRITON_ENABLE_PERF_ANALYZER_C_API}
-DTRITON_ENABLE_PERF_ANALYZER_TFS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TFS}
-DTRITON_ENABLE_PERF_ANALYZER_TS:BOOL=${TRITON_ENABLE_PERF_ANALYZER_TS}
-DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES}
-DTRITON_ENABLE_TESTS:BOOL=${TRITON_ENABLE_TESTS}
-DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX:PATH=${TRITON_INSTALL_PREFIX}
DEPENDS ${_py_client_depends}
)
endif() # TRITON_ENABLE_PYTHON_HTTP OR TRITON_ENABLE_PYTHON_GRPC
if(TRITON_ENABLE_JAVA_HTTP)
ExternalProject_Add(java-clients
PREFIX java-clients
SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src/java"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/java-clients"
CMAKE_CACHE_ARGS
${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE}
${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET}
-DTRITON_VERSION:STRING=${TRITON_VERSION}
-DTRITON_ENABLE_JAVA_HTTP:BOOL=${TRITON_ENABLE_JAVA_HTTP}
-DTRITON_ENABLE_EXAMPLES:BOOL=${TRITON_ENABLE_EXAMPLES}
-DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
-DCMAKE_INSTALL_PREFIX:PATH=${TRITON_INSTALL_PREFIX}
INSTALL_COMMAND ""
)
endif() # TRITON_ENABLE_JAVA_HTTP
Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
<!--
# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
# Triton Client Libraries and Examples
To simplify communication with Triton, the Triton project provides
several client libraries and examples of how to use those
libraries. Ask questions or report problems in the main Triton [issues
page](https://github.com/triton-inference-server/server/issues).
The provided client libraries are:
* [C++ and Python APIs](#client-library-apis) that make it easy to
communicate with Triton from your C++ or Python application. Using
these libraries you can send either HTTP/REST or GRPC requests to
Triton to access all its capabilities: inferencing, status and
health, statistics and metrics, model repository management,
etc. These libraries also support using system and CUDA shared
memory for passing inputs to and receiving outputs from Triton.
* [Java API](#client-library-apis) (contributed by Alibaba Cloud PAI Team)
that makes it easy to communicate with Triton from your Java application
using HTTP/REST requests. For now, only a limited feature subset is supported.
* The [protoc
compiler](https://developers.google.com/protocol-buffers/docs/tutorials)
can generate a GRPC API in a large number of programming
languages.
* See [src/grpc_generated/go](src/grpc_generated/go) for an example for the
[Go programming language](https://golang.org/).
* See [src/grpc_generated/java](src/grpc_generated/java) for an example for
the Java and Scala programming languages.
* See [src/grpc_generated/javascript](src/grpc_generated/javascript) for
an example with the JavaScript programming language.
There are also many example applications that show how to use these
libraries. Many of these examples use models from the [example model
repository](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/quickstart.md#create-a-model-repository).
* C++ and Python versions of *image_client*, an example application
that uses the C++ or Python client library to execute image
classification models on Triton. See [Image Classification
Example](#image-classification-example).
* Several simple [C++ examples](src/c%2B%2B/examples) show
how to use the C++ library to communicate with Triton to perform
inferencing and other tasks. The C++ examples demonstrating the
HTTP/REST client are named with a *simple_http_* prefix and the
examples demonstrating the GRPC client are named with a
*simple_grpc_* prefix. See [Simple Example
Applications](#simple-example-applications).
* Several simple [Python examples](src/python/examples)
show how to use the Python library to communicate with Triton to
perform inferencing and other tasks. The Python examples
demonstrating the HTTP/REST client are named with a *simple_http_*
prefix and the examples demonstrating the GRPC client are named with
a *simple_grpc_* prefix. See [Simple Example
Applications](#simple-example-applications).
* Several simple [Java
examples](src/java/src/main/java/triton/client/examples) show how to
use the Java API to communicate with Triton to perform inferencing
and other tasks.
* A couple of [Python examples that communicate with Triton using a
Python GRPC API](src/python/examples) generated by the
[protoc compiler](https://grpc.io/docs/guides/). *grpc_client.py* is
a simple example that shows simple API
usage. *grpc_image_client.py* is functionally equivalent to
*image_client* but uses a generated GRPC client stub to
communicate with Triton.
## Getting the Client Libraries And Examples
The easiest way to get the Python client library is to [use pip to
install the tritonclient
module](#download-using-python-package-installer-pip). You can also
download the C++, Python and Java client libraries from [Triton GitHub
release](#download-from-github), or [download a pre-built Docker image
containing the client libraries](#download-docker-image-from-ngc) from
[NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com).
It is also possible to build the client libraries with
[cmake](#build-using-cmake).
### Download Using Python Package Installer (pip)
The GRPC and HTTP client libraries are available as a Python package
that can be installed using a recent version of pip.
```
$ pip install tritonclient[all]
```
Using *all* installs both the HTTP/REST and GRPC client
libraries. Two optional packages, *grpc* and *http*, can be used to
install support for only the corresponding protocol. For example, to
install only the HTTP/REST client library use:
```
$ pip install tritonclient[http]
```
The components of the install packages are:
* http
* grpc [ `service_pb2`, `service_pb2_grpc`, `model_config_pb2` ]
* utils [the Linux distribution also includes `shared_memory` and `cuda_shared_memory`]
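As a quick check after installing, a minimal sketch like the one below
(assuming a Triton server is reachable at `localhost:8000`) creates an HTTP
client and queries the server's health and metadata:

```python
import tritonclient.http as httpclient

# Connect to a Triton server; adjust the URL for your deployment.
client = httpclient.InferenceServerClient(url="localhost:8000")

# Basic health and metadata queries exposed by the HTTP/REST API.
print("live:", client.is_server_live())
print("ready:", client.is_server_ready())
print("metadata:", client.get_server_metadata())
```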
The Linux version of the package also includes the
[perf_analyzer](src/c++/perf_analyzer/README.md)
binary. The perf_analyzer binary is built on Ubuntu 20.04 and may not
run on other Linux distributions. To run perf_analyzer, the
following dependency must be installed:
```bash
$ sudo apt update
$ sudo apt install libb64-dev
```
To reiterate, the installation on Windows will not include perf_analyzer
or the shared_memory/cuda_shared_memory components.
### Download From GitHub
The client libraries and the perf_analyzer executable can be
downloaded from the [Triton GitHub release
page](https://github.com/triton-inference-server/server/releases)
corresponding to the release you are interested in. The client
libraries are found in the "Assets" section of the release page in a
tar file named after the version of the release and the OS, for
example, v2.3.0_ubuntu2004.clients.tar.gz.
The pre-built libraries can be used on the corresponding host system
or you can install them into the Triton container to have both the
clients and server in the same container.
```bash
$ mkdir clients
$ cd clients
$ wget https://github.com/triton-inference-server/server/releases/download/<tarfile_path>
$ tar xzf <tarfile_name>
```
After installing, the libraries can be found in lib/, the headers in
include/, the Python wheel files in python/, and the jar files in
java/. The bin/ and python/ directories contain the built examples
that you can learn more about below.
The perf_analyzer binary is built on Ubuntu 20.04 and may not run on
other Linux distributions. To use the C++ libraries or perf_analyzer
executable you must install some dependencies.
```bash
$ apt-get update
$ apt-get install curl libcurl4-openssl-dev libb64-dev
```
### Download Docker Image From NGC
A Docker image containing the client libraries and examples is
available from [NVIDIA GPU Cloud
(NGC)](https://ngc.nvidia.com). Before attempting to pull the
container ensure you have access to NGC. For step-by-step
instructions, see the [NGC Getting Started
Guide](http://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html).
Use docker pull to get the client libraries and examples container
from NGC.
```bash
$ docker pull nvcr.io/nvidia/tritonserver:<xx.yy>-py3-sdk
```
Where \<xx.yy\> is the version that you want to pull. Within the
container the client libraries are in /workspace/install/lib, the
corresponding headers in /workspace/install/include, and the Python
wheel files in /workspace/install/python. The image will also contain
the built client examples.
**Important Note:** When running either the server or the client using
Docker containers and using the
[CUDA shared memory feature](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md#cuda-shared-memory)
you need to add the `--pid host` flag when launching the containers. The reason is
that CUDA IPC APIs require the PIDs of the source and destination of the exported
pointer to be different. Otherwise, Docker isolates each container in its own PID
namespace, which may cause the source and destination PIDs to be equal. The error is
always observed when both containers are started in non-interactive mode.
### Build Using CMake
The client library build is performed using CMake. To build the client
libraries and examples with all features, first change directory to
the root of this repo and check out the release version of the branch
that you want to build (or the *main* branch if you want to build the
under-development version).
```bash
$ git checkout main
```
If building the Java client you must first install Maven and a JDK
appropriate for your OS. For example, for Ubuntu you should install
the `default-jdk` package:
```
$ apt-get install default-jdk maven
```
Building on Windows vs. non-Windows requires different invocations
because Triton on Windows does not yet support all the build options.
#### Non-Windows
Use *cmake* to configure the build. Adjust the flags depending on which
components of the Triton client you want to build. For example, to build
Perf Analyzer with the Triton C API, use
`-DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON`. You can
also use the `TRITON_ENABLE_PERF_ANALYZER_TFS` and `TRITON_ENABLE_PERF_ANALYZER_TS` flags
to enable or disable support for the TensorFlow Serving and TorchServe backends, respectively, in Perf Analyzer.
The following command demonstrates how to build the client with all features:
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON -DTRITON_ENABLE_PERF_ANALYZER_TS=ON -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON -DTRITON_ENABLE_JAVA_HTTP=ON -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON ..
```
If you are building on a release branch (or on a development branch
that is based off of a release branch), then you must also use
additional cmake arguments to point to that release branch for repos
that the client build depends on. For example, if you are building the
r21.10 client branch then you need to use the following additional
cmake flags:
```
-DTRITON_COMMON_REPO_TAG=r21.10
-DTRITON_THIRD_PARTY_REPO_TAG=r21.10
-DTRITON_CORE_REPO_TAG=r21.10
```
Then use *make* to build the clients and examples.
```
$ make cc-clients python-clients java-clients
```
When the build completes the libraries and examples can be found in
the install directory.
#### Windows
To build the clients you must install an appropriate C++ compiler and
other dependencies required for the build. The easiest way to do this
is to create the [Windows min Docker
image](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/build.md#windows-10-min-container)
and then perform the build within a container launched from that image.
```
> docker run -it --rm win10-py3-min powershell
```
It is not necessary to use Docker or the win10-py3-min container for
the build, but if you do not, you must install the appropriate
dependencies onto your host system.
Next use *cmake* to configure the build. If you are not building
within the win10-py3-min container then you will likely need to adjust
the CMAKE_TOOLCHAIN_FILE location in the following command.
```
$ mkdir build
$ cd build
$ cmake -DVCPKG_TARGET_TRIPLET=x64-windows -DCMAKE_TOOLCHAIN_FILE='/vcpkg/scripts/buildsystems/vcpkg.cmake' -DCMAKE_INSTALL_PREFIX=install -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PYTHON_GRPC=ON -DTRITON_ENABLE_GPU=OFF -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON ..
```
If you are building on a release branch (or on a development branch
that is based off of a release branch), then you must also use
additional cmake arguments to point to that release branch for repos
that the client build depends on. For example, if you are building the
r21.10 client branch then you need to use the following additional
cmake flags:
```
-DTRITON_COMMON_REPO_TAG=r21.10
-DTRITON_THIRD_PARTY_REPO_TAG=r21.10
-DTRITON_CORE_REPO_TAG=r21.10
```
Then use msbuild.exe to build.
```
$ msbuild.exe cc-clients.vcxproj -p:Configuration=Release -clp:ErrorsOnly
$ msbuild.exe python-clients.vcxproj -p:Configuration=Release -clp:ErrorsOnly
```
When the build completes the libraries and examples can be found in
the install directory.
## Client Library APIs
The C++ client API exposes a class-based interface. The commented
interface is available in
[grpc_client.h](src/c%2B%2B/library/grpc_client.h),
[http_client.h](src/c%2B%2B/library/http_client.h), and
[common.h](src/c%2B%2B/library/common.h).
The Python client API provides similar capabilities as the C++
API. The commented interface is available in
[grpc](src/python/library/tritonclient/grpc/__init__.py)
and
[http](src/python/library/tritonclient/http/__init__.py).
The Java client API provides similar capabilities as the Python API
with similar classes and methods. For more information please refer
to the [Java client directory](src/java).
### HTTP Options
#### SSL/TLS
The client library allows communication across a secured channel using the HTTPS protocol. Setting these SSL options alone does not ensure secure communication; the Triton server should be running behind an `https://` proxy such as nginx. The client can then establish a secure channel to the proxy. The [`qa/L0_https`](https://github.com/triton-inference-server/server/blob/main/qa/L0_https/test.sh) test in the server repository demonstrates how this can be achieved.
For the C++ client, see the `HttpSslOptions` struct that encapsulates these options in [http_client.h](src/c%2B%2B/library/http_client.h).
For the Python client, look for the following options in [http/\_\_init\_\_.py](src/python/library/tritonclient/http/__init__.py):
* ssl
* ssl_options
* ssl_context_factory
* insecure
The [C++](src/c%2B%2B/examples/simple_http_infer_client.cc) and [Python](src/python/examples/simple_http_infer_client.py) examples
demonstrate how to use SSL/TLS settings on the client side.
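For the Python HTTP client, a minimal sketch of these options is shown below,
assuming an `https://` proxy such as nginx is listening on port 443 in front of
Triton; the option names are the ones listed above:

```python
import tritonclient.http as httpclient

# The client talks HTTPS to the proxy, which forwards requests to Triton.
client = httpclient.InferenceServerClient(
    url="localhost:443",
    ssl=True,        # use an SSL/TLS channel to the proxy
    insecure=False,  # verify the proxy's certificate
)
print(client.is_server_live())
```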
#### Compression
The client library enables on-wire compression for HTTP transactions.
For the C++ client, see the `request_compression_algorithm` and `response_compression_algorithm` parameters in the `Infer` and `AsyncInfer` functions in [http_client.h](src/c%2B%2B/library/http_client.h). By default, the parameter is set to `CompressionType::NONE`.
Similarly, for the Python client, see the `request_compression_algorithm` and `response_compression_algorithm` parameters in the `infer` and `async_infer` functions in [http/\_\_init\_\_.py](src/python/library/tritonclient/http/__init__.py).
The [C++](src/c%2B%2B/examples/simple_http_infer_client.cc) and [Python](src/python/examples/simple_http_infer_client.py) examples demonstrate how to use the compression options.
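For example, with the Python HTTP client the compression parameters named above
are passed per request. The sketch below assumes a hypothetical model named
`simple` with a 1x16 INT32 input called `INPUT0`:

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# Hypothetical model and tensor names, used only for illustration.
inputs = [httpclient.InferInput("INPUT0", [1, 16], "INT32")]
inputs[0].set_data_from_numpy(np.zeros((1, 16), dtype=np.int32))

# Compress both the request body and the response body on the wire.
result = client.infer(
    "simple",
    inputs,
    request_compression_algorithm="gzip",
    response_compression_algorithm="gzip",
)
```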
#### Python AsyncIO Support (Beta)
*This feature is currently in beta and may be subject to change.*
Advanced users may call the Python client via `async` and `await` syntax. The
[infer](src/python/examples/simple_http_aio_infer_client.py) example
demonstrates how to infer with AsyncIO.
If using SSL/TLS with AsyncIO, look for the `ssl` and `ssl_context` options in
[http/aio/\_\_init\_\_.py](src/python/library/tritonclient/http/aio/__init__.py).
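A minimal AsyncIO sketch, assuming a server at `localhost:8000`, might look
like the following:

```python
import asyncio

import tritonclient.http.aio as aio_httpclient


async def main():
    # The AsyncIO client mirrors the synchronous API, but every call is
    # awaitable and the client should be closed explicitly.
    client = aio_httpclient.InferenceServerClient(url="localhost:8000")
    try:
        print("live:", await client.is_server_live())
    finally:
        await client.close()


asyncio.run(main())
```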
#### Python Client Plugin API (Beta)
*This feature is currently in beta and may be subject to change.*
The Triton Client Plugin API lets you register custom plugins to add or modify
request headers. This is useful if you have a gateway in front of Triton Server
that requires extra headers for each request, such as HTTP Authorization. By
registering the plugin, your gateway will work with Python clients without
additional configuration. Note that Triton Server does not implement
authentication or authorization mechanisms and similarly,
Triton Server is not the direct consumer of the additional headers.
The plugin must implement the `__call__` method. The signature
of the `__call__` method should look like the following:
```python
class MyPlugin:
def __call__(self, request):
"""This method will be called for every HTTP request. Currently, the only
field that can be accessed by the request object is the `request.headers`
field. This field must be updated in-place.
"""
request.headers['my-header-key'] = 'my-header-value'
```
After the plugin implementation is complete, you can register the
plugin by calling `register_plugin` on the `InferenceServerClient` object.
```python
from tritonclient.http import InferenceServerClient
client = InferenceServerClient(...)
# Register the plugin
my_plugin = MyPlugin()
client.register_plugin(my_plugin)
# All the method calls will update the headers according to the plugin
# implementation.
client.infer(...)
```
To unregister the plugin, you can call the `client.unregister_plugin()`
function.
##### Basic Auth
You can register the `BasicAuth` plugin that implements
[Basic Authentication](https://en.wikipedia.org/wiki/Basic_access_authentication).
```python
from tritonclient.grpc.auth import BasicAuth
from tritonclient.grpc import InferenceServerClient
basic_auth = BasicAuth('username', 'password')
client = InferenceServerClient('...')
client.register_plugin(basic_auth)
```
The example above shows how to register the plugin for
gRPC client. The `BasicAuth` plugin can be registered
similarly for HTTP and
[AsyncIO](#python-asyncio-support-beta)
clients.
### GRPC Options
#### SSL/TLS
The client library allows communication across a secured channel using the gRPC protocol.
For the C++ client, see the `SslOptions` struct that encapsulates these options in [grpc_client.h](src/c%2B%2B/library/grpc_client.h).
For the Python client, look for the following options in [grpc/\_\_init\_\_.py](src/python/library/tritonclient/grpc/__init__.py):
* ssl
* root_certificates
* private_key
* certificate_chain
The [C++](src/c%2B%2B/examples/simple_grpc_infer_client.cc) and [Python](src/python/examples/simple_grpc_infer_client.py) examples
demonstrate how to use SSL/TLS settings on the client side. For information on the corresponding server-side parameters, refer to the
[server documentation](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#ssltls).
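For the Python client these options map to constructor arguments; the sketch
below is illustrative only, and the certificate file paths are placeholders:

```python
import tritonclient.grpc as grpcclient

# PEM file paths are placeholders; the option names are the ones listed
# above and are documented in grpc/__init__.py.
client = grpcclient.InferenceServerClient(
    url="localhost:8001",
    ssl=True,
    root_certificates="ca.crt",
    private_key="client.key",
    certificate_chain="client.crt",
)
print(client.is_server_live())
```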
#### Compression
The client library also exposes options to use on-wire compression for gRPC transactions.
For the C++ client, see the `compression_algorithm` parameter in the `Infer`, `AsyncInfer` and `StartStream` functions in [grpc_client.h](src/c%2B%2B/library/grpc_client.h). By default, the parameter is set to `GRPC_COMPRESS_NONE`.
Similarly, for the Python client, see the `compression_algorithm` parameter in the `infer`, `async_infer` and `start_stream` functions in [grpc/\_\_init\_\_.py](src/python/library/tritonclient/grpc/__init__.py).
The [C++](src/c%2B%2B/examples/simple_grpc_infer_client.cc) and [Python](src/python/examples/simple_grpc_infer_client.py) examples demonstrate how to configure compression for clients. For information on the corresponding server-side parameters, refer to the [server documentation](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#compression).
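For example, with the Python gRPC client the `compression_algorithm` parameter
is passed per call; the sketch below uses a hypothetical model named `simple`
with a 1x16 INT32 input called `INPUT0`:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Hypothetical model and tensor names, used only for illustration.
inputs = [grpcclient.InferInput("INPUT0", [1, 16], "INT32")]
inputs[0].set_data_from_numpy(np.zeros((1, 16), dtype=np.int32))

# Ask the gRPC channel to compress this call with gzip.
result = client.infer("simple", inputs, compression_algorithm="gzip")
```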
#### GRPC KeepAlive
Triton exposes GRPC KeepAlive parameters with the default values for both
client and server described [here](https://github.com/grpc/grpc/blob/master/doc/keepalive.md).
You can find a `KeepAliveOptions` struct/class that encapsulates these
parameters in both the [C++](src/c%2B%2B/library/grpc_client.h) and
[Python](src/python/library/tritonclient/grpc/__init__.py) client libraries.
There is also a [C++](src/c%2B%2B/examples/simple_grpc_keepalive_client.cc) and
[Python](src/python/examples/simple_grpc_keepalive_client.py) example
demonstrating how to set up these parameters on the client side. For information
on the corresponding server-side parameters, refer to the
[server documentation](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inference_protocols.md#grpc-keepalive).
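With the Python client, a `KeepAliveOptions` object is passed when the client
is constructed; the values in the sketch below are illustrative only, and the
field names follow the gRPC keepalive parameters described in the link above:

```python
import tritonclient.grpc as grpcclient

# Illustrative values; the defaults follow the gRPC keepalive documentation.
keepalive_options = grpcclient.KeepAliveOptions(
    keepalive_time_ms=2**31 - 1,
    keepalive_timeout_ms=20000,
    keepalive_permit_without_calls=False,
    http2_max_pings_without_data=2,
)

client = grpcclient.InferenceServerClient(
    url="localhost:8001",
    keepalive_options=keepalive_options,
)
print(client.is_server_live())
```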
#### Custom GRPC Channel Arguments
Advanced users may require specific client-side GRPC Channel Arguments that are
not currently exposed by Triton through direct means. To support this, Triton
allows users to pass custom channel arguments upon creating a GRPC client. When
using this option, it is up to the user to pass a valid combination of arguments
for their use case; Triton cannot feasibly test every possible combination of
channel arguments.
There is a [C++](src/c%2B%2B/examples/simple_grpc_custom_args_client.cc) and
[Python](src/python/examples/simple_grpc_custom_args_client.py) example
demonstrating how to construct and pass these custom arguments upon creating
a GRPC client.
You can find a comprehensive list of possible GRPC Channel Arguments
[here](https://grpc.github.io/grpc/core/group__grpc__arg__keys.html).
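With the Python gRPC client the custom arguments are passed as a list of
key/value tuples when the client is created; the sketch below picks two
arbitrary channel arguments purely for illustration:

```python
import tritonclient.grpc as grpcclient

# Example gRPC channel arguments; see the list of keys linked above.
channel_args = [
    ("grpc.lb_policy_name", "round_robin"),
    ("grpc.max_receive_message_length", 64 * 1024 * 1024),
]

client = grpcclient.InferenceServerClient(
    url="localhost:8001",
    channel_args=channel_args,
)
print(client.is_server_live())
```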
#### Python AsyncIO Support (Beta)
*This feature is currently in beta and may be subject to change.*
Advanced users may call the Python client via `async` and `await` syntax. The
[infer](src/python/examples/simple_grpc_aio_infer_client.py) and
[stream](src/python/examples/simple_grpc_aio_sequence_stream_infer_client.py)
examples demonstrate how to infer with AsyncIO.
## Simple Example Applications
This section describes several of the simple example applications and
the features that they illustrate.
### Bytes/String Datatype
Some frameworks support tensors where each element in the tensor is
variable-length binary data. Each element can hold a string or an
arbitrary sequence of bytes. On the client this datatype is BYTES (see
[Datatypes](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#datatypes)
for information on supported datatypes).
The Python client library uses numpy to represent input and output
tensors. For BYTES tensors the dtype of the numpy array should be
'np.object_' as shown in the examples. For backwards compatibility
with previous versions of the client library, 'np.bytes_' can also be
used for BYTES tensors. However, using 'np.bytes_' is not recommended
because using this dtype will cause numpy to remove all trailing zeros
from each array element. As a result, binary sequences ending in
zero(s) will not be represented correctly.
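For example, a BYTES input can be constructed with the Python HTTP client as
sketched below; the model name `string_model` and tensor name `INPUT0` are
hypothetical:

```python
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

# Variable-length byte strings must use the np.object_ dtype.
data = np.array([b"hello", b"\x00\x01\x02\x00"], dtype=np.object_)

inputs = [
    httpclient.InferInput("INPUT0", list(data.shape), np_to_triton_dtype(data.dtype))
]
inputs[0].set_data_from_numpy(data)

client = httpclient.InferenceServerClient(url="localhost:8000")
result = client.infer("string_model", inputs)  # hypothetical model name
```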
BYTES tensors are demonstrated in the C++ example applications
simple_http_string_infer_client.cc and
simple_grpc_string_infer_client.cc. String tensors are demonstrated
in the Python example applications simple_http_string_infer_client.py
and simple_grpc_string_infer_client.py.
### System Shared Memory
Using system shared memory to communicate tensors between the client
library and Triton can significantly improve performance in some
cases.
Using system shared memory is demonstrated in the C++ example
applications simple_http_shm_client.cc and simple_grpc_shm_client.cc.
Using system shared memory is demonstrated in the Python example
applications simple_http_shm_client.py and simple_grpc_shm_client.py.
Python does not have a standard way of allocating and accessing shared
memory, so a simple [system shared memory
module](src/python/library/tritonclient/utils/shared_memory)
is provided as an example; it can be used with the Python client library to
create, set, and destroy system shared memory.
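A condensed sketch of this workflow with the Python HTTP client and the system
shared memory module is shown below; the region key, region name, and model
name are illustrative only:

```python
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

client = httpclient.InferenceServerClient(url="localhost:8000")

# Create a system shared memory region and copy the input data into it.
input_data = np.arange(16, dtype=np.int32)
byte_size = input_data.size * input_data.itemsize
shm_handle = shm.create_shared_memory_region("input_data", "/input_simple", byte_size)
shm.set_shared_memory_region(shm_handle, [input_data])

# Tell Triton about the region, then reference it from the input.
client.register_system_shared_memory("input_data", "/input_simple", byte_size)
inputs = [httpclient.InferInput("INPUT0", [1, 16], "INT32")]
inputs[0].set_shared_memory("input_data", byte_size)

result = client.infer("simple", inputs)  # illustrative model name

# Clean up the region on both the server and the client.
client.unregister_system_shared_memory("input_data")
shm.destroy_shared_memory_region(shm_handle)
```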
### CUDA Shared Memory
Using CUDA shared memory to communicate tensors between the client
library and Triton can significantly improve performance in some
cases.
Using CUDA shared memory is demonstrated in the C++ example
applications simple_http_cudashm_client.cc and
simple_grpc_cudashm_client.cc. Using CUDA shared memory is
demonstrated in the Python example applications
simple_http_cudashm_client.py and simple_grpc_cudashm_client.py.
Python does not have a standard way of allocating and accessing shared
memory, so a simple [CUDA shared memory
module](src/python/library/tritonclient/utils/cuda_shared_memory)
is provided as an example; it can be used with the Python client library to
create, set, and destroy CUDA shared memory. The module currently supports
numpy arrays ([example usage](src/python/examples/simple_http_cudashm_client.py))
and DLPack tensors ([example usage](src/python/library/tests/test_dlpack.py)).
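The CUDA shared memory workflow is similar; the condensed sketch below assumes
a GPU is available, and the region and model names are again illustrative only:

```python
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cudashm

client = httpclient.InferenceServerClient(url="localhost:8000")

# Create a CUDA shared memory region on GPU 0 and copy the input into it.
input_data = np.arange(16, dtype=np.int32)
byte_size = input_data.size * input_data.itemsize
cuda_handle = cudashm.create_shared_memory_region("input_data", byte_size, 0)
cudashm.set_shared_memory_region(cuda_handle, [input_data])

# Register the region with Triton and reference it from the input.
client.register_cuda_shared_memory(
    "input_data", cudashm.get_raw_handle(cuda_handle), 0, byte_size)
inputs = [httpclient.InferInput("INPUT0", [1, 16], "INT32")]
inputs[0].set_shared_memory("input_data", byte_size)

result = client.infer("simple", inputs)  # illustrative model name

client.unregister_cuda_shared_memory("input_data")
cudashm.destroy_shared_memory_region(cuda_handle)
```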
### Client API for Stateful Models
When performing inference using a [stateful
model](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#stateful-models),
a client must identify which inference requests belong to the same
sequence and also when a sequence starts and ends.
Each sequence is identified with a sequence ID that is provided when
an inference request is made. It is up to the clients to create a
unique sequence ID. For each sequence the first inference request
should be marked as the start of the sequence and the last inference
request should be marked as the end of the sequence.
The use of the sequence ID and the start and end flags is demonstrated in the
C++ example applications simple_http_sequence_stream_infer_client.cc
and simple_grpc_sequence_stream_infer_client.cc, and in the Python example
applications simple_http_sequence_stream_infer_client.py and
simple_grpc_sequence_stream_infer_client.py.
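With the Python gRPC client, the sequence ID and the start and end flags are
passed to each `infer` call; the sketch below uses a hypothetical stateful
model named `simple_sequence` with a single INT32 input called `INPUT`:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")
sequence_id = 1000  # illustrative; must be unique per active sequence

values = [1, 2, 3]  # illustrative per-request inputs for the sequence
for i, value in enumerate(values):
    inputs = [grpcclient.InferInput("INPUT", [1, 1], "INT32")]
    inputs[0].set_data_from_numpy(np.array([[value]], dtype=np.int32))
    # Mark the first request as the sequence start and the last as the end.
    client.infer(
        "simple_sequence",
        inputs,
        sequence_id=sequence_id,
        sequence_start=(i == 0),
        sequence_end=(i == len(values) - 1),
    )
```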
## Image Classification Example
The image classification example that uses the C++ client API is
available at
[src/c++/examples/image_client.cc](src/c%2B%2B/examples/image_client.cc). The
Python version of the image classification client is available at
[src/python/examples/image_client.py](src/python/examples/image_client.py).
To use image_client (or image_client.py) you must first have a running
Triton that is serving one or more image classification models. The
image_client application requires that the model have a single image
input and produce a single classification output. If you don't have a
model repository with image classification models see
[QuickStart](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/quickstart.md)
for instructions on how to create one.
Once Triton is running you can use the image_client application to
send inference requests. You can specify a single image or a directory
holding images. Here we send a request for the inception_graphdef
model for an image from the
[qa/images](https://github.com/triton-inference-server/server/tree/main/qa/images) directory.
```bash
$ image_client -m inception_graphdef -s INCEPTION qa/images/mug.jpg
Request 0, batch size 1
Image 'qa/images/mug.jpg':
0.754130 (505) = COFFEE MUG
```
The Python version of the application accepts the same command-line
arguments.
```bash
$ python image_client.py -m inception_graphdef -s INCEPTION qa/images/mug.jpg
Request 0, batch size 1
Image 'qa/images/mug.jpg':
0.826384 (505) = COFFEE MUG
```
The image_client and image_client.py applications use the client
libraries to talk to Triton. By default image_client instructs the
client library to use HTTP/REST protocol, but you can use the GRPC
protocol by providing the -i flag. You must also use the -u flag to
point at the GRPC endpoint on Triton.
```bash
$ image_client -i grpc -u localhost:8001 -m inception_graphdef -s INCEPTION qa/images/mug.jpg
Request 0, batch size 1
Image 'qa/images/mug.jpg':
0.754130 (505) = COFFEE MUG
```
By default the client prints the most probable classification for the
image. Use the -c flag to see more classifications.
```bash
$ image_client -m inception_graphdef -s INCEPTION -c 3 qa/images/mug.jpg
Request 0, batch size 1
Image 'qa/images/mug.jpg':
0.754130 (505) = COFFEE MUG
0.157077 (969) = CUP
0.002880 (968) = ESPRESSO
```
The -b flag allows you to send a batch of images for inferencing.
The image_client application will form the batch from the image or
images that you specified. If the batch is bigger than the number of
images then image_client will just repeat the images to fill the
batch.
```bash
$ image_client -m inception_graphdef -s INCEPTION -c 3 -b 2 qa/images/mug.jpg
Request 0, batch size 2
Image 'qa/images/mug.jpg':
0.754130 (505) = COFFEE MUG
0.157077 (969) = CUP
0.002880 (968) = ESPRESSO
Image 'qa/images/mug.jpg':
0.754130 (505) = COFFEE MUG
0.157077 (969) = CUP
0.002880 (968) = ESPRESSO
```
Provide a directory instead of a single image to perform inferencing
on all images in the directory.
```
$ image_client -m inception_graphdef -s INCEPTION -c 3 -b 2 qa/images
Request 0, batch size 2
Image '/opt/tritonserver/qa/images/car.jpg':
0.819196 (818) = SPORTS CAR
0.033457 (437) = BEACH WAGON
0.031232 (480) = CAR WHEEL
Image '/opt/tritonserver/qa/images/mug.jpg':
0.754130 (505) = COFFEE MUG
0.157077 (969) = CUP
0.002880 (968) = ESPRESSO
Request 1, batch size 2
Image '/opt/tritonserver/qa/images/vulture.jpeg':
0.977632 (24) = VULTURE
0.000613 (9) = HEN
0.000560 (137) = EUROPEAN GALLINULE
Image '/opt/tritonserver/qa/images/car.jpg':
0.819196 (818) = SPORTS CAR
0.033457 (437) = BEACH WAGON
0.031232 (480) = CAR WHEEL
```
The [grpc_image_client.py](src/python/examples/grpc_image_client.py)
application behaves the same as the image_client except that instead
of using the client library it uses the GRPC generated library to
communicate with Triton.
## Ensemble Image Classification Example Application
In comparison to the image classification example above, this example
uses an ensemble of an image-preprocessing model implemented as a
[DALI
backend](https://github.com/triton-inference-server/dali_backend) and
a TensorFlow Inception model. The ensemble model allows you to send
the raw image binaries in the request and receive classification
results without preprocessing the images on the client.
To try this example you should follow the [DALI ensemble example
instructions](https://github.com/triton-inference-server/dali_backend/tree/main/docs/examples/inception_ensemble).
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[tool.codespell]
# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
# this is only to allow you to run codespell interactively
# this also overrides the grpc_generated folder, since it is generated
skip = "./.git,./.github,./src/grpc_generated"
# ignore short words, and typename parameters like OffsetT
ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
# ignore allowed words
# ignoring atleast to avoid testing::AtLeast from getting flagged
ignore-words-list = "atleast"
# use the 'clear' dictionary for unambiguous spelling mistakes
builtin = "clear"
# disable warnings about binary files and wrong encoding
quiet-level = 3
[tool.isort]
profile = "black"
use_parentheses = true
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
ensure_newline_before_comments = true
line_length = 88
balanced_wrapping = true
indent = " "
skip = ["build"]
# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required(VERSION 3.17)
project(cc-clients LANGUAGES C CXX)
#
# Options
#
option(TRITON_ENABLE_CC_HTTP "Build C++ HTTP client libraries" OFF)
option(TRITON_ENABLE_CC_GRPC "Build C++ GRPC client libraries" OFF)
option(TRITON_ENABLE_PERF_ANALYZER "Enable Performance Analyzer" OFF)
option(TRITON_ENABLE_EXAMPLES "Include examples in build" OFF)
option(TRITON_ENABLE_TESTS "Include tests in build" OFF)
option(TRITON_ENABLE_GPU "Enable GPU support in libraries" OFF)
option(TRITON_USE_THIRD_PARTY "Use local version of third party libraries" ON)
option(TRITON_KEEP_TYPEINFO "Keep typeinfo symbols by disabling ldscript" OFF)
option(TRITON_ENABLE_ZLIB "Include ZLIB library in build" ON)
set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
#
# Dependencies
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
googletest
URL https://github.com/google/googletest/archive/9406a60c7839052e4944ea4dbc8344762a89f9bd.zip
)
if(TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER)
set(TRITON_COMMON_ENABLE_PROTOBUF ON)
set(TRITON_COMMON_ENABLE_GRPC ON)
if(TRITON_ENABLE_PERF_ANALYZER)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-core)
endif() # TRITON_ENABLE_PERF_ANALYZER
endif() # TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
if(NOT TRITON_ENABLE_PERF_ANALYZER AND NOT TRITON_ENABLE_CC_HTTP AND NOT TRITON_ENABLE_EXAMPLES)
set(TRITON_COMMON_ENABLE_JSON OFF)
endif()
if(TRITON_ENABLE_TESTS OR TRITON_ENABLE_PERF_ANALYZER)
FetchContent_MakeAvailable(googletest)
endif()
FetchContent_MakeAvailable(repo-common)
if(TRITON_ENABLE_TESTS)
include_directories(
${repo-common_SOURCE_DIR}/include
)
endif() # TRITON_ENABLE_TESTS
#
# CUDA
#
if(TRITON_ENABLE_GPU)
find_package(CUDAToolkit REQUIRED)
endif() # TRITON_ENABLE_GPU
#
# libcurl
#
if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_PERF_ANALYZER)
find_package(CURL REQUIRED)
message(STATUS "Using curl ${CURL_VERSION}")
endif() # TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_PERF_ANALYZER
#
# Protobuf
#
if(TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER)
set(protobuf_MODULE_COMPATIBLE TRUE CACHE BOOL "protobuf_MODULE_COMPATIBLE" FORCE)
find_package(Protobuf CONFIG REQUIRED)
message(STATUS "Using protobuf ${Protobuf_VERSION}")
include_directories(${Protobuf_INCLUDE_DIRS})
endif() # TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
#
# GRPC
#
if(TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER)
find_package(gRPC CONFIG REQUIRED)
message(STATUS "Using gRPC ${gRPC_VERSION}")
include_directories($<TARGET_PROPERTY:gRPC::grpc,INTERFACE_INCLUDE_DIRECTORIES>)
endif() # TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER)
add_subdirectory(library)
endif() # TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC OR TRITON_ENABLE_PERF_ANALYZER
if(TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC)
if(TRITON_ENABLE_EXAMPLES)
add_subdirectory(examples)
endif() # TRITON_ENABLE_EXAMPLES
if(TRITON_ENABLE_TESTS)
add_subdirectory(tests)
endif() # TRITON_ENABLE_TESTS
endif() # TRITON_ENABLE_CC_HTTP OR TRITON_ENABLE_CC_GRPC
if(TRITON_ENABLE_PERF_ANALYZER)
add_subdirectory(perf_analyzer)
endif() # TRITON_ENABLE_PERF_ANALYZER
# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required (VERSION 3.18)
if(WIN32)
message("C++ examples are not currently supported on Windows because "
"they require functionalities that are UNIX specific.")
else()
if(TRITON_ENABLE_CC_HTTP AND TRITON_ENABLE_CC_GRPC)
#
# yolov7-tiny
#
find_package(OpenCV REQUIRED)
add_executable(
yolov7-tiny
yolov7-tiny.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
yolov7-tiny
PRIVATE ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(
yolov7-tiny
PRIVATE
grpcclient_static
httpclient_static
${OpenCV_LIBS}
)
install(
TARGETS yolov7-tiny
RUNTIME DESTINATION bin
)
#
# resnet50
#
find_package(OpenCV REQUIRED)
add_executable(
resnet50
resnet50.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
resnet50
PRIVATE ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(
resnet50
PRIVATE
grpcclient_static
httpclient_static
${OpenCV_LIBS}
)
install(
TARGETS resnet50
RUNTIME DESTINATION bin
)
#
# image_client
#
find_package(OpenCV REQUIRED)
add_executable(
image_client
image_client.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_include_directories(
image_client
PRIVATE ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(
image_client
PRIVATE
grpcclient_static
httpclient_static
${OpenCV_LIBS}
)
install(
TARGETS image_client
RUNTIME DESTINATION bin
)
#
# ensemble_image_client
#
add_executable(
ensemble_image_client
ensemble_image_client.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_link_libraries(
ensemble_image_client
PRIVATE
grpcclient_static
httpclient_static
)
install(
TARGETS ensemble_image_client
RUNTIME DESTINATION bin
)
#
# reuse_infer_objects_client
#
add_executable(
reuse_infer_objects_client
reuse_infer_objects_client.cc
$<TARGET_OBJECTS:shm-utils-library>
)
target_link_libraries(
reuse_infer_objects_client
PRIVATE
grpcclient_static
httpclient_static
)
install(
TARGETS reuse_infer_objects_client
RUNTIME DESTINATION bin
)
endif() # TRITON_ENABLE_CC_HTTP AND TRITON_ENABLE_CC_GRPC
if(TRITON_ENABLE_CC_GRPC)
#
# simple_grpc_health_metadata
#
add_executable(simple_grpc_health_metadata simple_grpc_health_metadata.cc)
target_link_libraries(
simple_grpc_health_metadata
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_health_metadata
RUNTIME DESTINATION bin
)
#
# simple_grpc_model_control
#
add_executable(simple_grpc_model_control simple_grpc_model_control.cc)
target_link_libraries(
simple_grpc_model_control
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_model_control
RUNTIME DESTINATION bin
)
#
# simple_grpc_infer_client
#
add_executable(simple_grpc_infer_client simple_grpc_infer_client.cc)
target_link_libraries(
simple_grpc_infer_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_infer_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_keepalive_client
#
add_executable(simple_grpc_keepalive_client simple_grpc_keepalive_client.cc)
target_link_libraries(
simple_grpc_keepalive_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_keepalive_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_custom_args_client
#
add_executable(simple_grpc_custom_args_client simple_grpc_custom_args_client.cc)
target_link_libraries(
simple_grpc_custom_args_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_custom_args_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_string_infer_client
#
add_executable(simple_grpc_string_infer_client simple_grpc_string_infer_client.cc)
target_link_libraries(
simple_grpc_string_infer_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_string_infer_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_async_infer_client
#
add_executable(simple_grpc_async_infer_client simple_grpc_async_infer_client.cc)
target_link_libraries(
simple_grpc_async_infer_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_async_infer_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_sequence_stream_infer_client
#
add_executable(simple_grpc_sequence_stream_infer_client simple_grpc_sequence_stream_infer_client.cc)
target_link_libraries(
simple_grpc_sequence_stream_infer_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_sequence_stream_infer_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_sequence_sync_infer_client
#
add_executable(simple_grpc_sequence_sync_infer_client simple_grpc_sequence_sync_infer_client.cc)
target_link_libraries(
simple_grpc_sequence_sync_infer_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_sequence_sync_infer_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_shm_client
#
add_executable(
simple_grpc_shm_client
simple_grpc_shm_client.cc
$<TARGET_OBJECTS:shm-utils-library>
)
target_link_libraries(
simple_grpc_shm_client
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_shm_client
RUNTIME DESTINATION bin
)
#
# simple_grpc_custom_repeat
#
add_executable(simple_grpc_custom_repeat simple_grpc_custom_repeat.cc)
target_link_libraries(
simple_grpc_custom_repeat
PRIVATE
grpcclient_static
)
install(
TARGETS simple_grpc_custom_repeat
RUNTIME DESTINATION bin
)
if(${TRITON_ENABLE_GPU})
#
# simple_grpc_cudashm_client
#
set(
SIMPLE_GRPC_CUDA_SHM_SRCS
simple_grpc_cudashm_client.cc
)
set(
SIMPLE_GRPC_CUDA_SHM_HDRS
)
add_executable(simple_grpc_cudashm_client ${SIMPLE_GRPC_CUDA_SHM_SRCS} ${SIMPLE_GRPC_CUDA_SHM_HDRS})
target_include_directories(simple_grpc_cudashm_client PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(
simple_grpc_cudashm_client
PRIVATE
grpcclient_static
${CUDA_LIBRARIES}
)
install(
TARGETS simple_grpc_cudashm_client
RUNTIME DESTINATION bin
)
endif() # TRITON_ENABLE_GPU
endif() # TRITON_ENABLE_CC_GRPC
if(TRITON_ENABLE_CC_HTTP)
#
# simple_http_health_metadata
#
add_executable(
simple_http_health_metadata
simple_http_health_metadata.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_link_libraries(
simple_http_health_metadata
PRIVATE
httpclient_static
)
install(
TARGETS simple_http_health_metadata
RUNTIME DESTINATION bin
)
#
# simple_http_model_control
#
add_executable(
simple_http_model_control
simple_http_model_control.cc
$<TARGET_OBJECTS:json-utils-library>
)
target_link_libraries(
simple_http_model_control
PRIVATE
httpclient_static
)
install(
TARGETS simple_http_model_control
RUNTIME DESTINATION bin
)
#
# simple_http_infer_client
#
add_executable(simple_http_infer_client simple_http_infer_client.cc)
target_link_libraries(
simple_http_infer_client
PRIVATE
httpclient_static
)
install(
TARGETS simple_http_infer_client
RUNTIME DESTINATION bin
)
#
# simple_http_string_infer_client
#
add_executable(simple_http_string_infer_client simple_http_string_infer_client.cc)
target_link_libraries(
simple_http_string_infer_client
PRIVATE
httpclient_static
)
install(
TARGETS simple_http_string_infer_client
RUNTIME DESTINATION bin
)
#
# simple_http_async_infer_client
#
add_executable(simple_http_async_infer_client simple_http_async_infer_client.cc)
target_link_libraries(
simple_http_async_infer_client
PRIVATE
httpclient_static
)
install(
TARGETS simple_http_async_infer_client
RUNTIME DESTINATION bin
)
#
# simple_http_sequence_sync_infer_client
#
add_executable(simple_http_sequence_sync_infer_client simple_http_sequence_sync_infer_client.cc)
target_link_libraries(
simple_http_sequence_sync_infer_client
PRIVATE
httpclient_static
)
install(
TARGETS simple_http_sequence_sync_infer_client
RUNTIME DESTINATION bin
)
#
# simple_http_shm_client
#
add_executable(
simple_http_shm_client
simple_http_shm_client.cc
$<TARGET_OBJECTS:shm-utils-library>
)
target_link_libraries(
simple_http_shm_client
PRIVATE
httpclient_static
rt
)
install(
TARGETS simple_http_shm_client
RUNTIME DESTINATION bin
)
if(${TRITON_ENABLE_GPU})
#
# simple_http_cudashm_client
#
set(
SIMPLE_HTTP_CUDA_SHM_SRCS
simple_http_cudashm_client.cc
)
set(
SIMPLE_HTTP_CUDA_SHM_HDRS
)
add_executable(simple_http_cudashm_client ${SIMPLE_HTTP_CUDA_SHM_SRCS} ${SIMPLE_HTTP_CUDA_SHM_HDRS})
target_include_directories(simple_http_cudashm_client PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(
simple_http_cudashm_client
PRIVATE
httpclient_static
${CUDA_LIBRARIES}
)
install(
TARGETS simple_http_cudashm_client
RUNTIME DESTINATION bin
)
endif() # TRITON_ENABLE_GPU
endif() # TRITON_ENABLE_CC_HTTP
endif() # WIN32
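#
# Adding another example (illustrative sketch, with a hypothetical
# my_client.cc): follow the same pattern as the targets above.
#   add_executable(my_client my_client.cc $<TARGET_OBJECTS:json-utils-library>)
#   target_link_libraries(my_client PRIVATE grpcclient_static httpclient_static)
#   install(TARGETS my_client RUNTIME DESTINATION bin)
#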
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <dirent.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include "grpc_client.h"
#include "http_client.h"
#include "json_utils.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
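// FAIL_IF_ERR evaluates a client call and, if the returned tc::Error is not
// OK, prints the given message and exits. Typical use later in this file:
//   FAIL_IF_ERR(input_ptr->Reset(), "unable to reset INPUT");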
namespace {
void
Postprocess(
const std::unique_ptr<tc::InferResult> result,
const std::vector<std::string>& filenames, const size_t batch_size,
const size_t topk)
{
std::string output_name("OUTPUT");
if (!result->RequestStatus().IsOk()) {
std::cerr << "inference failed with error: " << result->RequestStatus()
<< std::endl;
exit(1);
}
if (filenames.size() != batch_size) {
std::cerr << "expected " << batch_size << " filenames, got "
<< filenames.size() << std::endl;
exit(1);
}
// Get and validate the shape and datatype
std::vector<int64_t> shape;
tc::Error err = result->Shape(output_name, &shape);
if (!err.IsOk()) {
std::cerr << "unable to get shape for " << output_name << std::endl;
exit(1);
}
// Validate shape
if ((shape.size() != 2) || (shape[0] != (int)batch_size) ||
(shape[1] != (int)topk)) {
std::cerr << "received incorrect shapes for " << output_name << std::endl;
exit(1);
}
std::string datatype;
err = result->Datatype(output_name, &datatype);
if (!err.IsOk()) {
std::cerr << "unable to get datatype for " << output_name << std::endl;
exit(1);
}
// Validate datatype
if (datatype.compare("BYTES") != 0) {
std::cerr << "received incorrect datatype for " << output_name << ": "
<< datatype << std::endl;
exit(1);
}
std::vector<std::string> result_data;
err = result->StringData(output_name, &result_data);
if (!err.IsOk()) {
std::cerr << "unable to get data for " << output_name << std::endl;
exit(1);
}
if (result_data.size() != (topk * batch_size)) {
std::cerr << "unexpected number of strings in the result, expected "
<< (topk * batch_size) << ", got " << result_data.size()
<< std::endl;
exit(1);
}
size_t index = 0;
for (size_t b = 0; b < batch_size; ++b) {
std::cout << "Image '" << filenames[b] << "':" << std::endl;
for (size_t c = 0; c < topk; ++c) {
std::istringstream is(result_data[index]);
int count = 0;
std::string token;
while (getline(is, token, ':')) {
if (count == 0) {
std::cout << " " << token;
} else if (count == 1) {
std::cout << " (" << token << ")";
} else if (count == 2) {
std::cout << " = " << token;
}
count++;
}
std::cout << std::endl;
index++;
}
}
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0]
<< " [options] <image filename / image folder>" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-c <topk>" << std::endl;
std::cerr << "\t-i <Protocol used to communicate with inference service>"
<< std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << std::endl;
std::cerr << "For -c, the <topk> classes will be returned, default is 1."
<< std::endl;
std::cerr
<< "For -i, available protocols are 'grpc' and 'http'. Default is 'http."
<< std::endl;
exit(1);
}
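// TritonClient holds either an HTTP or a gRPC client in a union so the rest
// of the code can work with a single object. The constructor
// placement-constructs the HTTP member; which member is actually created and
// used is decided at runtime from the -i protocol flag.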
union TritonClient {
TritonClient()
{
new (&http_client_) std::unique_ptr<tc::InferenceServerHttpClient>{};
}
~TritonClient() {}
std::unique_ptr<tc::InferenceServerHttpClient> http_client_;
std::unique_ptr<tc::InferenceServerGrpcClient> grpc_client_;
};
} // namespace
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8000");
std::string protocol = "http";
size_t topk = 1;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vi:u:p:c:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'i':
protocol = optarg;
break;
case 'u':
url = optarg;
break;
case 'c':
topk = std::atoi(optarg);
break;
case '?':
Usage(argv);
break;
}
}
if (topk <= 0) {
Usage(argv, "topk must be > 0");
}
// The ensemble model takes 1 input tensor with shape [ 1 ] and STRING
// data type and returns 1 output tensor as top k (see '-c' flag)
// classification result of the input.
std::string model_name = "preprocess_inception_ensemble";
// Create the inference client for the model.
TritonClient triton_client;
tc::Error err;
if (protocol == "http") {
err = tc::InferenceServerHttpClient::Create(
&triton_client.http_client_, url, verbose);
} else {
err = tc::InferenceServerGrpcClient::Create(
&triton_client.grpc_client_, url, verbose);
}
if (!err.IsOk()) {
std::cerr << "error: unable to create client for inference: " << err
<< std::endl;
exit(1);
}
if (optind >= argc) {
Usage(argv, "image file or image folder must be specified");
}
if (!err.IsOk()) {
std::cerr << "error: unable to create inference context: " << err
<< std::endl;
exit(1);
}
// Obtain a list of the image names to be processed
std::vector<std::string> image_filenames;
struct stat name_stat;
if (stat(argv[optind], &name_stat) != 0) {
std::cerr << "Failed to find '" << std::string(argv[optind])
<< "': " << strerror(errno) << std::endl;
exit(1);
}
if (name_stat.st_mode & S_IFDIR) {
const std::string dirname = argv[optind];
DIR* dir_ptr = opendir(dirname.c_str());
struct dirent* d_ptr;
while ((d_ptr = readdir(dir_ptr)) != NULL) {
const std::string filename = d_ptr->d_name;
if ((filename != ".") && (filename != "..")) {
image_filenames.push_back(dirname + "/" + filename);
}
}
closedir(dir_ptr);
} else {
image_filenames.push_back(argv[optind]);
}
// Sort the filenames so that we always visit them in the same order
// (readdir does not guarantee any particular order).
std::sort(image_filenames.begin(), image_filenames.end());
// Read the raw image as string
std::vector<std::vector<std::string>> images;
for (const auto& fn : image_filenames) {
images.emplace_back();
auto& image_str = images.back();
std::ifstream file(fn);
file >> std::noskipws;
image_str.emplace_back(
(std::istreambuf_iterator<char>(file)),
std::istreambuf_iterator<char>());
if (image_str.back().empty()) {
std::cerr << "error: unable to read image file " << fn << std::endl;
exit(1);
}
}
  // This client only sends one request for simplicity, so the maximum number
  // of images to be processed is limited by the model's maximum batch size.
size_t batch_size = 0;
if (protocol == "http") {
std::string model_config;
err = triton_client.http_client_->ModelConfig(&model_config, model_name);
if (!err.IsOk()) {
std::cerr << "error: failed to get model config: " << err << std::endl;
}
rapidjson::Document model_config_json;
err = tc::ParseJson(&model_config_json, model_config);
if (!err.IsOk()) {
std::cerr << "error: failed to parse model config: " << err << std::endl;
}
const auto bs_itr = model_config_json.FindMember("max_batch_size");
if (bs_itr != model_config_json.MemberEnd()) {
batch_size = bs_itr->value.GetInt();
}
} else {
inference::ModelConfigResponse model_config;
err = triton_client.grpc_client_->ModelConfig(&model_config, model_name);
if (!err.IsOk()) {
std::cerr << "error: failed to get model config: " << err << std::endl;
}
batch_size = model_config.config().max_batch_size();
}
if (images.size() > batch_size) {
std::cerr << "The number of images exceeds maximum batch size, only the"
<< " first " << batch_size << " images, sorted by name"
<< " alphabetically, will be processed" << std::endl;
}
batch_size = (images.size() < batch_size) ? images.size() : batch_size;
// Initialize the inputs with the data.
tc::InferInput* input;
std::vector<int64_t> shape{(int64_t)batch_size, 1};
err = tc::InferInput::Create(&input, "INPUT", shape, "BYTES");
if (!err.IsOk()) {
std::cerr << "unable to get input: " << err << std::endl;
exit(1);
}
std::shared_ptr<tc::InferInput> input_ptr(input);
tc::InferRequestedOutput* output;
  // Set the number of classifications expected
err = tc::InferRequestedOutput::Create(&output, "OUTPUT", topk);
if (!err.IsOk()) {
std::cerr << "unable to get output: " << err << std::endl;
exit(1);
}
std::shared_ptr<tc::InferRequestedOutput> output_ptr(output);
std::vector<tc::InferInput*> inputs = {input_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {output_ptr.get()};
tc::InferOptions options(model_name);
FAIL_IF_ERR(input_ptr->Reset(), "unable to reset INPUT");
for (size_t i = 0; i < batch_size; i++) {
FAIL_IF_ERR(
input_ptr->AppendFromString(images[i]), "unable to set data for INPUT");
}
// Send inference request to the inference server.
tc::InferResult* results;
if (protocol == "http") {
FAIL_IF_ERR(
triton_client.http_client_->Infer(&results, options, inputs, outputs),
"unable to run model");
} else {
FAIL_IF_ERR(
triton_client.grpc_client_->Infer(&results, options, inputs, outputs),
"unable to run model");
}
std::unique_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Print classification results
Postprocess(std::move(results_ptr), image_filenames, batch_size, topk);
return 0;
}
// Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <dirent.h>
#include <getopt.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <algorithm>
#include <condition_variable>
#include <fstream>
#include <iostream>
#include <iterator>
#include <mutex>
#include <opencv2/core/version.hpp>
#include <queue>
#include <string>
#include "grpc_client.h"
#include "http_client.h"
#include "json_utils.h"
#if CV_MAJOR_VERSION == 2
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#elif CV_MAJOR_VERSION >= 3
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#endif
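// Map color-conversion names to the cv::COLOR_* constants used by OpenCV 4;
// older OpenCV releases expose them as legacy CV_* constants instead.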
#if CV_MAJOR_VERSION == 4
#define GET_TRANSFORMATION_CODE(x) cv::COLOR_##x
#else
#define GET_TRANSFORMATION_CODE(x) CV_##x
#endif
namespace tc = triton::client;
namespace {
enum ScaleType { NONE = 0, VGG = 1, INCEPTION = 2 };
enum ProtocolType { HTTP = 0, GRPC = 1 };
struct ModelInfo {
std::string output_name_;
std::string input_name_;
std::string input_datatype_;
// The shape of the input
int input_c_;
int input_h_;
int input_w_;
// The format of the input
std::string input_format_;
int type1_;
int type3_;
int max_batch_size_;
};
void
Preprocess(
const cv::Mat& img, const std::string& format, int img_type1, int img_type3,
size_t img_channels, const cv::Size& img_size, const ScaleType scale,
std::vector<uint8_t>* input_data)
{
// Image channels are in BGR order. Currently model configuration
// data doesn't provide any information as to the expected channel
// orderings (like RGB, BGR). We are going to assume that RGB is the
// most likely ordering and so change the channels to that ordering.
cv::Mat sample;
if ((img.channels() == 3) && (img_channels == 1)) {
cv::cvtColor(img, sample, GET_TRANSFORMATION_CODE(BGR2GRAY));
} else if ((img.channels() == 4) && (img_channels == 1)) {
cv::cvtColor(img, sample, GET_TRANSFORMATION_CODE(BGRA2GRAY));
} else if ((img.channels() == 3) && (img_channels == 3)) {
cv::cvtColor(img, sample, GET_TRANSFORMATION_CODE(BGR2RGB));
} else if ((img.channels() == 4) && (img_channels == 3)) {
cv::cvtColor(img, sample, GET_TRANSFORMATION_CODE(BGRA2RGB));
} else if ((img.channels() == 1) && (img_channels == 3)) {
cv::cvtColor(img, sample, GET_TRANSFORMATION_CODE(GRAY2RGB));
} else {
std::cerr << "unexpected number of channels " << img.channels()
<< " in input image, model expects " << img_channels << "."
<< std::endl;
exit(1);
}
cv::Mat sample_resized;
if (sample.size() != img_size) {
cv::resize(sample, sample_resized, img_size);
} else {
sample_resized = sample;
}
cv::Mat sample_type;
sample_resized.convertTo(
sample_type, (img_channels == 3) ? img_type3 : img_type1);
cv::Mat sample_final;
if (scale == ScaleType::INCEPTION) {
if (img_channels == 1) {
sample_final = sample_type.mul(cv::Scalar(1 / 127.5));
sample_final = sample_final - cv::Scalar(1.0);
} else {
sample_final =
sample_type.mul(cv::Scalar(1 / 127.5, 1 / 127.5, 1 / 127.5));
sample_final = sample_final - cv::Scalar(1.0, 1.0, 1.0);
}
} else if (scale == ScaleType::VGG) {
if (img_channels == 1) {
sample_final = sample_type - cv::Scalar(128);
} else {
sample_final = sample_type - cv::Scalar(123, 117, 104);
}
} else {
sample_final = sample_type;
}
// Allocate a buffer to hold all image elements.
size_t img_byte_size = sample_final.total() * sample_final.elemSize();
size_t pos = 0;
input_data->resize(img_byte_size);
  // For NHWC format the Mat is already in the correct order, but we need to
  // handle both cases of the data being contiguous or not.
if (format.compare("FORMAT_NHWC") == 0) {
if (sample_final.isContinuous()) {
memcpy(&((*input_data)[0]), sample_final.datastart, img_byte_size);
pos = img_byte_size;
} else {
size_t row_byte_size = sample_final.cols * sample_final.elemSize();
for (int r = 0; r < sample_final.rows; ++r) {
memcpy(
&((*input_data)[pos]), sample_final.ptr<uint8_t>(r), row_byte_size);
pos += row_byte_size;
}
}
} else {
// (format.compare("FORMAT_NCHW") == 0)
//
    // For CHW formats we must split out each channel from the matrix and
    // order them as BBBB...GGGG...RRRR. To do this, split the channels
    // of the image directly into 'input_data'. The BGR channels are
    // backed by the 'input_data' vector, so the data ends up in CHW
    // order.
std::vector<cv::Mat> input_bgr_channels;
for (size_t i = 0; i < img_channels; ++i) {
input_bgr_channels.emplace_back(
img_size.height, img_size.width, img_type1, &((*input_data)[pos]));
pos += input_bgr_channels.back().total() *
input_bgr_channels.back().elemSize();
}
cv::split(sample_final, input_bgr_channels);
}
if (pos != img_byte_size) {
std::cerr << "unexpected total size of channels " << pos << ", expecting "
<< img_byte_size << std::endl;
exit(1);
}
}
void
Postprocess(
const std::unique_ptr<tc::InferResult> result,
const std::vector<std::string>& filenames, const size_t batch_size,
const std::string& output_name, const size_t topk, const bool batching)
{
if (!result->RequestStatus().IsOk()) {
std::cerr << "inference failed with error: " << result->RequestStatus()
<< std::endl;
exit(1);
}
if (filenames.size() != batch_size) {
std::cerr << "expected " << batch_size << " filenames, got "
<< filenames.size() << std::endl;
exit(1);
}
// Get and validate the shape and datatype
std::vector<int64_t> shape;
tc::Error err = result->Shape(output_name, &shape);
if (!err.IsOk()) {
std::cerr << "unable to get shape for " << output_name << std::endl;
exit(1);
}
// Validate shape. Special handling for non-batch model
if (!batching) {
if ((shape.size() != 1) || (shape[0] != (int)topk)) {
std::cerr << "received incorrect shape for " << output_name << std::endl;
exit(1);
}
} else {
if ((shape.size() != 2) || (shape[0] != (int)batch_size) ||
(shape[1] != (int)topk)) {
std::cerr << "received incorrect shape for " << output_name << std::endl;
exit(1);
}
}
std::string datatype;
err = result->Datatype(output_name, &datatype);
if (!err.IsOk()) {
std::cerr << "unable to get datatype for " << output_name << std::endl;
exit(1);
}
// Validate datatype
if (datatype.compare("BYTES") != 0) {
std::cerr << "received incorrect datatype for " << output_name << ": "
<< datatype << std::endl;
exit(1);
}
std::vector<std::string> result_data;
err = result->StringData(output_name, &result_data);
if (!err.IsOk()) {
std::cerr << "unable to get data for " << output_name << std::endl;
exit(1);
}
if (result_data.size() != (topk * batch_size)) {
std::cerr << "unexpected number of strings in the result, expected "
<< (topk * batch_size) << ", got " << result_data.size()
<< std::endl;
exit(1);
}
size_t index = 0;
for (size_t b = 0; b < batch_size; ++b) {
std::cout << "Image '" << filenames[b] << "':" << std::endl;
for (size_t c = 0; c < topk; ++c) {
std::istringstream is(result_data[index]);
int count = 0;
std::string token;
while (getline(is, token, ':')) {
if (count == 0) {
std::cout << " " << token;
} else if (count == 1) {
std::cout << " (" << token << ")";
} else if (count == 2) {
std::cout << " = " << token;
}
count++;
}
std::cout << std::endl;
index++;
}
}
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0]
<< " [options] <image filename / image folder>" << std::endl;
std::cerr << " Note that image folder should only contain image files."
<< std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-a" << std::endl;
std::cerr << "\t--streaming" << std::endl;
std::cerr << "\t-b <batch size>" << std::endl;
std::cerr << "\t-c <topk>" << std::endl;
std::cerr << "\t-s <NONE|INCEPTION|VGG>" << std::endl;
std::cerr << "\t-p <preprocessed output filename>" << std::endl;
std::cerr << "\t-m <model name>" << std::endl;
std::cerr << "\t-x <model version>" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-i <Protocol used to communicate with inference service>"
<< std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr << "If -a is specified then asynchronous client API will be used. "
<< "Default is to use the synchronous API." << std::endl;
std::cerr << "The --streaming flag is only valid with gRPC protocol."
<< std::endl;
std::cerr
<< "For -b, a single image will be replicated and sent in a batch"
<< std::endl
<< " of the specified size. A directory of images will be grouped"
<< std::endl
<< " into batches. Default is 1." << std::endl;
std::cerr << "For -c, the <topk> classes will be returned, default is 1."
<< std::endl;
std::cerr << "For -s, specify the type of pre-processing scaling that"
<< std::endl
<< " should be performed on the image, default is NONE."
<< std::endl
<< " INCEPTION: scale each pixel RGB value to [-1.0, 1.0)."
<< std::endl
<< " VGG: subtract mean BGR value (123, 117, 104) from"
<< std::endl
<< " each pixel." << std::endl;
std::cerr
<< "If -x is not specified the most recent version (that is, the highest "
<< "numbered version) of the model will be used." << std::endl;
std::cerr << "For -p, it generates file only if image file is specified."
<< std::endl;
std::cerr << "For -u, the default server URL is localhost:8000." << std::endl;
std::cerr << "For -i, available protocols are gRPC and HTTP. Default is HTTP."
<< std::endl;
std::cerr
<< "For -H, the header will be added to HTTP requests (ignored for GRPC "
"requests). The header must be specified as 'Header:Value'. -H may be "
"specified multiple times to add multiple headers."
<< std::endl;
std::cerr << std::endl;
exit(1);
}
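// Example invocation (illustrative; placeholders rather than real paths):
//   image_client -m <model_name> -s INCEPTION -c 3 -b 2 <image_file_or_folder>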
ScaleType
ParseScale(const std::string& str)
{
if (str == "NONE") {
return ScaleType::NONE;
} else if (str == "INCEPTION") {
return ScaleType::INCEPTION;
} else if (str == "VGG") {
return ScaleType::VGG;
}
std::cerr << "unexpected scale type \"" << str
<< "\", expecting NONE, INCEPTION or VGG" << std::endl;
exit(1);
return ScaleType::NONE;
}
ProtocolType
ParseProtocol(const std::string& str)
{
std::string protocol(str);
std::transform(protocol.begin(), protocol.end(), protocol.begin(), ::tolower);
if (protocol == "http") {
return ProtocolType::HTTP;
} else if (protocol == "grpc") {
return ProtocolType::GRPC;
}
std::cerr << "unexpected protocol type \"" << str
<< "\", expecting HTTP or gRPC" << std::endl;
exit(1);
return ProtocolType::HTTP;
}
bool
ParseType(const std::string& dtype, int* type1, int* type3)
{
if (dtype.compare("UINT8") == 0) {
*type1 = CV_8UC1;
*type3 = CV_8UC3;
} else if (dtype.compare("INT8") == 0) {
*type1 = CV_8SC1;
*type3 = CV_8SC3;
} else if (dtype.compare("UINT16") == 0) {
*type1 = CV_16UC1;
*type3 = CV_16UC3;
} else if (dtype.compare("INT16") == 0) {
*type1 = CV_16SC1;
*type3 = CV_16SC3;
} else if (dtype.compare("INT32") == 0) {
*type1 = CV_32SC1;
*type3 = CV_32SC3;
} else if (dtype.compare("FP32") == 0) {
*type1 = CV_32FC1;
*type3 = CV_32FC3;
} else if (dtype.compare("FP64") == 0) {
*type1 = CV_64FC1;
*type3 = CV_64FC3;
} else {
return false;
}
return true;
}
void
ParseModelGrpc(
const inference::ModelMetadataResponse& model_metadata,
const inference::ModelConfigResponse& model_config, const size_t batch_size,
ModelInfo* model_info)
{
if (model_metadata.inputs().size() != 1) {
std::cerr << "expecting 1 input, got " << model_metadata.inputs().size()
<< std::endl;
exit(1);
}
if (model_metadata.outputs().size() != 1) {
std::cerr << "expecting 1 output, got " << model_metadata.outputs().size()
<< std::endl;
exit(1);
}
if (model_config.config().input().size() != 1) {
std::cerr << "expecting 1 input in model configuration, got "
<< model_config.config().input().size() << std::endl;
exit(1);
}
auto input_metadata = model_metadata.inputs(0);
auto input_config = model_config.config().input(0);
auto output_metadata = model_metadata.outputs(0);
if (output_metadata.datatype().compare("FP32") != 0) {
std::cerr << "expecting output datatype to be FP32, model '"
<< model_metadata.name() << "' output type is '"
<< output_metadata.datatype() << "'" << std::endl;
exit(1);
}
model_info->max_batch_size_ = model_config.config().max_batch_size();
// Model specifying maximum batch size of 0 indicates that batching
// is not supported and so the input tensors do not expect a "N"
// dimension (and 'batch_size' should be 1 so that only a single
// image instance is inferred at a time).
if (model_info->max_batch_size_ == 0) {
if (batch_size != 1) {
std::cerr << "batching not supported for model \""
<< model_metadata.name() << "\"" << std::endl;
exit(1);
}
} else {
// model_info->max_batch_size_ > 0
if (batch_size > (size_t)model_info->max_batch_size_) {
std::cerr << "expecting batch size <= " << model_info->max_batch_size_
<< " for model '" << model_metadata.name() << "'" << std::endl;
exit(1);
}
}
  // Output is expected to be a vector. But allow any number of
  // dimensions as long as all but one dimension is of size 1
  // (e.g. { 10 }, { 1, 10 }, { 10, 1, 1 } are all ok).
bool output_batch_dim = (model_info->max_batch_size_ > 0);
size_t non_one_cnt = 0;
for (const auto dim : output_metadata.shape()) {
if (output_batch_dim) {
output_batch_dim = false;
} else if (dim == -1) {
std::cerr << "variable-size dimension in model output not supported"
<< std::endl;
exit(1);
} else if (dim > 1) {
non_one_cnt += 1;
if (non_one_cnt > 1) {
std::cerr << "expecting model output to be a vector" << std::endl;
exit(1);
}
}
}
  // Model input must have 3 dims, either CHW or HWC (not counting the
  // batch dimension).
const bool input_batch_dim = (model_info->max_batch_size_ > 0);
const int expected_input_dims = 3 + (input_batch_dim ? 1 : 0);
if (input_metadata.shape().size() != expected_input_dims) {
std::cerr << "expecting input to have " << expected_input_dims
<< " dimensions, model '" << model_metadata.name()
<< "' input has " << input_metadata.shape().size() << std::endl;
exit(1);
}
if ((input_config.format() != inference::ModelInput::FORMAT_NCHW) &&
(input_config.format() != inference::ModelInput::FORMAT_NHWC)) {
std::cerr
<< "unexpected input format "
<< inference::ModelInput_Format_Name(input_config.format())
<< ", expecting "
<< inference::ModelInput_Format_Name(inference::ModelInput::FORMAT_NHWC)
<< " or "
<< inference::ModelInput_Format_Name(inference::ModelInput::FORMAT_NCHW)
<< std::endl;
exit(1);
}
model_info->output_name_ = output_metadata.name();
model_info->input_name_ = input_metadata.name();
model_info->input_datatype_ = input_metadata.datatype();
if (input_config.format() == inference::ModelInput::FORMAT_NHWC) {
model_info->input_format_ = "FORMAT_NHWC";
model_info->input_h_ = input_metadata.shape(input_batch_dim ? 1 : 0);
model_info->input_w_ = input_metadata.shape(input_batch_dim ? 2 : 1);
model_info->input_c_ = input_metadata.shape(input_batch_dim ? 3 : 2);
} else {
model_info->input_format_ = "FORMAT_NCHW";
model_info->input_c_ = input_metadata.shape(input_batch_dim ? 1 : 0);
model_info->input_h_ = input_metadata.shape(input_batch_dim ? 2 : 1);
model_info->input_w_ = input_metadata.shape(input_batch_dim ? 3 : 2);
}
if (!ParseType(
model_info->input_datatype_, &(model_info->type1_),
&(model_info->type3_))) {
std::cerr << "unexpected input datatype '" << model_info->input_datatype_
<< "' for model \"" << model_metadata.name() << std::endl;
exit(1);
}
}
void
ParseModelHttp(
const rapidjson::Document& model_metadata,
const rapidjson::Document& model_config, const size_t batch_size,
ModelInfo* model_info)
{
const auto& input_itr = model_metadata.FindMember("inputs");
size_t input_count = 0;
if (input_itr != model_metadata.MemberEnd()) {
input_count = input_itr->value.Size();
}
if (input_count != 1) {
std::cerr << "expecting 1 input, got " << input_count << std::endl;
exit(1);
}
const auto& output_itr = model_metadata.FindMember("outputs");
size_t output_count = 0;
if (output_itr != model_metadata.MemberEnd()) {
output_count = output_itr->value.Size();
}
if (output_count != 1) {
std::cerr << "expecting 1 output, got " << output_count << std::endl;
exit(1);
}
const auto& input_config_itr = model_config.FindMember("input");
input_count = 0;
if (input_config_itr != model_config.MemberEnd()) {
input_count = input_config_itr->value.Size();
}
if (input_count != 1) {
std::cerr << "expecting 1 input in model configuration, got " << input_count
<< std::endl;
exit(1);
}
const auto& input_metadata = *input_itr->value.Begin();
const auto& input_config = *input_config_itr->value.Begin();
const auto& output_metadata = *output_itr->value.Begin();
const auto& output_dtype_itr = output_metadata.FindMember("datatype");
if (output_dtype_itr == output_metadata.MemberEnd()) {
std::cerr << "output missing datatype in the metadata for model'"
<< model_metadata["name"].GetString() << "'" << std::endl;
exit(1);
}
auto datatype = std::string(
output_dtype_itr->value.GetString(),
output_dtype_itr->value.GetStringLength());
if (datatype.compare("FP32") != 0) {
std::cerr << "expecting output datatype to be FP32, model '"
<< model_metadata["name"].GetString() << "' output type is '"
<< datatype << "'" << std::endl;
exit(1);
}
int max_batch_size = 0;
const auto bs_itr = model_config.FindMember("max_batch_size");
if (bs_itr != model_config.MemberEnd()) {
max_batch_size = bs_itr->value.GetUint();
}
model_info->max_batch_size_ = max_batch_size;
// Model specifying maximum batch size of 0 indicates that batching
// is not supported and so the input tensors do not expect a "N"
// dimension (and 'batch_size' should be 1 so that only a single
// image instance is inferred at a time).
if (max_batch_size == 0) {
if (batch_size != 1) {
std::cerr << "batching not supported for model '"
<< model_metadata["name"].GetString() << "'" << std::endl;
exit(1);
}
} else {
// max_batch_size > 0
if (batch_size > (size_t)max_batch_size) {
std::cerr << "expecting batch size <= " << max_batch_size
<< " for model '" << model_metadata["name"].GetString() << "'"
<< std::endl;
exit(1);
}
}
  // Output is expected to be a vector. But allow any number of
  // dimensions as long as all but one dimension is of size 1
  // (e.g. { 10 }, { 1, 10 }, { 10, 1, 1 } are all ok).
bool output_batch_dim = (max_batch_size > 0);
size_t non_one_cnt = 0;
const auto output_shape_itr = output_metadata.FindMember("shape");
if (output_shape_itr != output_metadata.MemberEnd()) {
const rapidjson::Value& shape_json = output_shape_itr->value;
for (rapidjson::SizeType i = 0; i < shape_json.Size(); i++) {
if (output_batch_dim) {
output_batch_dim = false;
} else if (shape_json[i].GetInt() == -1) {
std::cerr << "variable-size dimension in model output not supported"
<< std::endl;
exit(1);
} else if (shape_json[i].GetInt() > 1) {
non_one_cnt += 1;
if (non_one_cnt > 1) {
std::cerr << "expecting model output to be a vector" << std::endl;
exit(1);
}
}
}
} else {
std::cerr << "output missing shape in the metadata for model'"
<< model_metadata["name"].GetString() << "'" << std::endl;
exit(1);
}
  // Model input must have 3 dims, either CHW or HWC (not counting the
  // batch dimension).
const bool input_batch_dim = (max_batch_size > 0);
const size_t expected_input_dims = 3 + (input_batch_dim ? 1 : 0);
const auto input_shape_itr = input_metadata.FindMember("shape");
if (input_shape_itr != input_metadata.MemberEnd()) {
if (input_shape_itr->value.Size() != expected_input_dims) {
std::cerr << "expecting input to have " << expected_input_dims
<< " dimensions, model '" << model_metadata["name"].GetString()
<< "' input has " << input_shape_itr->value.Size() << std::endl;
exit(1);
}
} else {
std::cerr << "input missing shape in the metadata for model'"
<< model_metadata["name"].GetString() << "'" << std::endl;
exit(1);
}
model_info->input_format_ = std::string(
input_config["format"].GetString(),
input_config["format"].GetStringLength());
if ((model_info->input_format_.compare("FORMAT_NCHW") != 0) &&
(model_info->input_format_.compare("FORMAT_NHWC") != 0)) {
std::cerr << "unexpected input format " << model_info->input_format_
<< ", expecting FORMAT_NCHW or FORMAT_NHWC" << std::endl;
exit(1);
}
model_info->output_name_ = std::string(
output_metadata["name"].GetString(),
output_metadata["name"].GetStringLength());
model_info->input_name_ = std::string(
input_metadata["name"].GetString(),
input_metadata["name"].GetStringLength());
model_info->input_datatype_ = std::string(
input_metadata["datatype"].GetString(),
input_metadata["datatype"].GetStringLength());
if (model_info->input_format_.compare("FORMAT_NHWC") == 0) {
model_info->input_h_ =
input_shape_itr->value[input_batch_dim ? 1 : 0].GetInt();
model_info->input_w_ =
input_shape_itr->value[input_batch_dim ? 2 : 1].GetInt();
model_info->input_c_ =
input_shape_itr->value[input_batch_dim ? 3 : 2].GetInt();
} else {
model_info->input_c_ =
input_shape_itr->value[input_batch_dim ? 1 : 0].GetInt();
model_info->input_h_ =
input_shape_itr->value[input_batch_dim ? 2 : 1].GetInt();
model_info->input_w_ =
input_shape_itr->value[input_batch_dim ? 3 : 2].GetInt();
}
if (!ParseType(
model_info->input_datatype_, &(model_info->type1_),
&(model_info->type3_))) {
std::cerr << "unexpected input datatype '" << model_info->input_datatype_
<< "' for model \"" << model_metadata["name"].GetString()
<< std::endl;
exit(1);
}
}
void
FileToInputData(
const std::string& filename, size_t c, size_t h, size_t w,
const std::string& format, int type1, int type3, ScaleType scale,
std::vector<uint8_t>* input_data)
{
// Load the specified image.
std::ifstream file(filename);
std::vector<char> data;
file >> std::noskipws;
std::copy(
std::istream_iterator<char>(file), std::istream_iterator<char>(),
std::back_inserter(data));
if (data.empty()) {
std::cerr << "error: unable to read image file " << filename << std::endl;
exit(1);
}
cv::Mat img = imdecode(cv::Mat(data), 1);
if (img.empty()) {
std::cerr << "error: unable to decode image " << filename << std::endl;
exit(1);
}
// Pre-process the image to match input size expected by the model.
Preprocess(img, format, type1, type3, c, cv::Size(w, h), scale, input_data);
}
union TritonClient {
TritonClient()
{
new (&http_client_) std::unique_ptr<tc::InferenceServerHttpClient>{};
}
~TritonClient() {}
std::unique_ptr<tc::InferenceServerHttpClient> http_client_;
std::unique_ptr<tc::InferenceServerGrpcClient> grpc_client_;
};
} // namespace
int
main(int argc, char** argv)
{
bool verbose = false;
bool async = false;
bool streaming = false;
int batch_size = 1;
int topk = 1;
ScaleType scale = ScaleType::NONE;
std::string preprocess_output_filename;
std::string model_name;
std::string model_version = "";
std::string url("localhost:8000");
ProtocolType protocol = ProtocolType::HTTP;
tc::Headers http_headers;
static struct option long_options[] = {{"streaming", 0, 0, 0}, {0, 0, 0, 0}};
// Parse commandline...
int opt;
while ((opt = getopt_long(
argc, argv, "vau:m:x:b:c:s:p:i:H:", long_options, NULL)) != -1) {
switch (opt) {
case 0:
streaming = true;
break;
case 'v':
verbose = true;
break;
case 'a':
async = true;
break;
case 'u':
url = optarg;
break;
case 'm':
model_name = optarg;
break;
case 'x':
model_version = optarg;
break;
case 'b':
batch_size = std::atoi(optarg);
break;
case 'c':
topk = std::atoi(optarg);
break;
case 's':
scale = ParseScale(optarg);
break;
case 'p':
preprocess_output_filename = optarg;
break;
case 'i':
protocol = ParseProtocol(optarg);
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
if (model_name.empty()) {
Usage(argv, "-m flag must be specified");
}
if (batch_size <= 0) {
Usage(argv, "batch size must be > 0");
}
if (topk <= 0) {
Usage(argv, "topk must be > 0");
}
if (optind >= argc) {
Usage(argv, "image file or image folder must be specified");
}
if (streaming && (protocol != ProtocolType::GRPC)) {
Usage(argv, "Streaming is only allowed with gRPC protocol");
}
if (streaming && (!async)) {
Usage(argv, "Only async operation is supported in streaming");
}
if (!http_headers.empty() && (protocol != ProtocolType::HTTP)) {
std::cerr << "WARNING: HTTP headers specified with -H are ignored when "
"using non-HTTP protocol."
<< std::endl;
}
// Create the inference client for the server. From it
// extract and validate that the model meets the requirements for
// image classification.
TritonClient triton_client;
tc::Error err;
if (protocol == ProtocolType::HTTP) {
err = tc::InferenceServerHttpClient::Create(
&triton_client.http_client_, url, verbose);
} else {
err = tc::InferenceServerGrpcClient::Create(
&triton_client.grpc_client_, url, verbose);
}
if (!err.IsOk()) {
std::cerr << "error: unable to create client for inference: " << err
<< std::endl;
exit(1);
}
ModelInfo model_info;
if (protocol == ProtocolType::HTTP) {
std::string model_metadata;
err = triton_client.http_client_->ModelMetadata(
&model_metadata, model_name, model_version, http_headers);
if (!err.IsOk()) {
std::cerr << "error: failed to get model metadata: " << err << std::endl;
}
rapidjson::Document model_metadata_json;
err = tc::ParseJson(&model_metadata_json, model_metadata);
if (!err.IsOk()) {
std::cerr << "error: failed to parse model metadata: " << err
<< std::endl;
}
std::string model_config;
err = triton_client.http_client_->ModelConfig(
&model_config, model_name, model_version, http_headers);
if (!err.IsOk()) {
std::cerr << "error: failed to get model config: " << err << std::endl;
}
rapidjson::Document model_config_json;
err = tc::ParseJson(&model_config_json, model_config);
if (!err.IsOk()) {
std::cerr << "error: failed to parse model config: " << err << std::endl;
}
ParseModelHttp(
model_metadata_json, model_config_json, batch_size, &model_info);
} else {
inference::ModelMetadataResponse model_metadata;
err = triton_client.grpc_client_->ModelMetadata(
&model_metadata, model_name, model_version, http_headers);
if (!err.IsOk()) {
std::cerr << "error: failed to get model metadata: " << err << std::endl;
}
inference::ModelConfigResponse model_config;
err = triton_client.grpc_client_->ModelConfig(
&model_config, model_name, model_version, http_headers);
if (!err.IsOk()) {
std::cerr << "error: failed to get model config: " << err << std::endl;
}
ParseModelGrpc(model_metadata, model_config, batch_size, &model_info);
}
// Collect the names of the image(s).
std::vector<std::string> image_filenames;
struct stat name_stat;
if (stat(argv[optind], &name_stat) != 0) {
std::cerr << "Failed to find '" << std::string(argv[optind])
<< "': " << strerror(errno) << std::endl;
exit(1);
}
if (name_stat.st_mode & S_IFDIR) {
const std::string dirname = argv[optind];
DIR* dir_ptr = opendir(dirname.c_str());
struct dirent* d_ptr;
while ((d_ptr = readdir(dir_ptr)) != NULL) {
const std::string filename = d_ptr->d_name;
if ((filename != ".") && (filename != "..")) {
image_filenames.push_back(dirname + "/" + filename);
}
}
closedir(dir_ptr);
} else {
image_filenames.push_back(argv[optind]);
}
// Sort the filenames so that we always visit them in the same order
// (readdir does not guarantee any particular order).
std::sort(image_filenames.begin(), image_filenames.end());
// Preprocess the images into input data according to model
// requirements
std::vector<std::vector<uint8_t>> image_data;
for (const auto& fn : image_filenames) {
image_data.emplace_back();
FileToInputData(
fn, model_info.input_c_, model_info.input_h_, model_info.input_w_,
model_info.input_format_, model_info.type1_, model_info.type3_, scale,
&(image_data.back()));
if ((image_data.size() == 1) && !preprocess_output_filename.empty()) {
std::ofstream output_file(preprocess_output_filename);
std::ostream_iterator<uint8_t> output_iterator(output_file);
std::copy(image_data[0].begin(), image_data[0].end(), output_iterator);
}
}
std::vector<int64_t> shape;
// Include the batch dimension if required
if (model_info.max_batch_size_ != 0) {
shape.push_back(batch_size);
}
if (model_info.input_format_.compare("FORMAT_NHWC") == 0) {
shape.push_back(model_info.input_h_);
shape.push_back(model_info.input_w_);
shape.push_back(model_info.input_c_);
} else {
shape.push_back(model_info.input_c_);
shape.push_back(model_info.input_h_);
shape.push_back(model_info.input_w_);
}
// Initialize the inputs with the data.
tc::InferInput* input;
err = tc::InferInput::Create(
&input, model_info.input_name_, shape, model_info.input_datatype_);
if (!err.IsOk()) {
std::cerr << "unable to get input: " << err << std::endl;
exit(1);
}
std::shared_ptr<tc::InferInput> input_ptr(input);
tc::InferRequestedOutput* output;
  // Set the number of classifications expected
err =
tc::InferRequestedOutput::Create(&output, model_info.output_name_, topk);
if (!err.IsOk()) {
std::cerr << "unable to get output: " << err << std::endl;
exit(1);
}
std::shared_ptr<tc::InferRequestedOutput> output_ptr(output);
std::vector<tc::InferInput*> inputs = {input_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {output_ptr.get()};
// Configure context for 'batch_size' and 'topk'
tc::InferOptions options(model_name);
options.model_version_ = model_version;
// Send requests of 'batch_size' images. If the number of images
// isn't an exact multiple of 'batch_size' then just start over with
// the first images until the batch is filled.
//
// Number of requests sent = ceil(number of images / batch_size)
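  // For example (illustrative): with 5 images and -b 2, three requests are
  // sent and the last batch is filled by wrapping around to the first image.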
std::vector<std::unique_ptr<tc::InferResult>> results;
std::vector<std::vector<std::string>> result_filenames;
size_t image_idx = 0;
size_t done_cnt = 0;
size_t sent_count = 0;
bool last_request = false;
std::mutex mtx;
std::condition_variable cv;
auto callback_func = [&](tc::InferResult* result) {
{
// Defer the response retrieval to main thread
std::lock_guard<std::mutex> lk(mtx);
results.emplace_back(result);
done_cnt++;
}
cv.notify_all();
};
if (streaming) {
err = triton_client.grpc_client_->StartStream(
callback_func, true /* enable_stats */, 0 /* stream_timeout */,
http_headers);
if (!err.IsOk()) {
std::cerr << "failed to establish the stream: " << err << std::endl;
}
}
while (!last_request) {
// Reset the input for new request.
err = input_ptr->Reset();
if (!err.IsOk()) {
std::cerr << "failed resetting input: " << err << std::endl;
exit(1);
}
// Set input to be the next 'batch_size' images (preprocessed).
std::vector<std::string> input_filenames;
for (int idx = 0; idx < batch_size; ++idx) {
input_filenames.push_back(image_filenames[image_idx]);
err = input_ptr->AppendRaw(image_data[image_idx]);
if (!err.IsOk()) {
std::cerr << "failed setting input: " << err << std::endl;
exit(1);
}
image_idx = (image_idx + 1) % image_data.size();
if (image_idx == 0) {
last_request = true;
}
}
result_filenames.emplace_back(std::move(input_filenames));
options.request_id_ = std::to_string(sent_count);
// Send request.
if (!async) {
tc::InferResult* result;
if (protocol == ProtocolType::HTTP) {
err = triton_client.http_client_->Infer(
&result, options, inputs, outputs, http_headers);
} else {
err = triton_client.grpc_client_->Infer(
&result, options, inputs, outputs, http_headers);
}
if (!err.IsOk()) {
std::cerr << "failed sending synchronous infer request: " << err
<< std::endl;
exit(1);
}
results.emplace_back(result);
} else {
if (streaming) {
err = triton_client.grpc_client_->AsyncStreamInfer(
options, inputs, outputs);
} else {
if (protocol == ProtocolType::HTTP) {
err = triton_client.http_client_->AsyncInfer(
callback_func, options, inputs, outputs, http_headers);
} else {
err = triton_client.grpc_client_->AsyncInfer(
callback_func, options, inputs, outputs, http_headers);
}
}
if (!err.IsOk()) {
std::cerr << "failed sending asynchronous infer request: " << err
<< std::endl;
exit(1);
}
}
sent_count++;
}
// For async, retrieve results according to the send order
if (async) {
// Wait until all callbacks are invoked
{
std::unique_lock<std::mutex> lk(mtx);
cv.wait(lk, [&]() {
if (done_cnt >= sent_count) {
return true;
} else {
return false;
}
});
}
}
// Post-process the results to make prediction(s)
for (size_t idx = 0; idx < results.size(); idx++) {
std::cout << "Request " << idx << ", batch size " << batch_size
<< std::endl;
Postprocess(
std::move(results[idx]), result_filenames[idx], batch_size,
model_info.output_name_, topk, model_info.max_batch_size_ != 0);
}
return 0;
}
#include <dirent.h>
#include <getopt.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <unistd.h>
#include <algorithm>
#include <condition_variable>
#include <fstream>
#include <iostream>
#include <iterator>
#include <mutex>
#include <queue>
#include <string>
#include "grpc_client.h"
#include "http_client.h"
#include "json_utils.h"
#include <opencv2/opencv.hpp>
#include <opencv2/core/version.hpp>
#if CV_MAJOR_VERSION == 2
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#elif CV_MAJOR_VERSION >= 3
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#endif
#if CV_MAJOR_VERSION == 4
#define GET_TRANSFORMATION_CODE(x) cv::COLOR_##x
#else
#define GET_TRANSFORMATION_CODE(x) CV_##x
#endif
using namespace cv;
namespace tc = triton::client;
namespace {
enum ProtocolType { HTTP = 0, GRPC = 1 };
struct ModelInfo {
std::string output_name_;
std::string input_name_;
std::string input_datatype_;
int input_c_;
int input_h_;
int input_w_;
std::string input_format_;
int type1_;
int type3_;
int max_batch_size_;
};
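// Convert raw logits into probabilities with a numerically stable softmax
// (the maximum logit is subtracted before exponentiation).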
std::vector<float>
ComputeSoftmax(const std::vector<float>& results)
{
  std::vector<float> softmax_results(results.size());
  if (results.empty()) {
    return softmax_results;
  }
  // Subtract the maximum logit before exponentiation for numerical stability.
  const float max_value = *std::max_element(results.begin(), results.end());
  float sum = 0.0f;
  for (size_t i = 0; i < results.size(); ++i) {
    softmax_results[i] = exp(results[i] - max_value);
    sum += softmax_results[i];
  }
  for (size_t i = 0; i < results.size(); ++i) {
    softmax_results[i] /= sum;
  }
  return softmax_results;
}
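// Decode the image file, convert BGR to RGB, resize to the model's input
// size, apply ImageNet mean/std normalization, and write the result in
// planar (CHW) order into 'input_data'.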
void
Preprocess(
const std::string& filename, int img_type1, int img_type3, size_t img_channels,
const cv::Size& img_size, std::vector<uint8_t>* input_data)
{
cv::Mat img = cv::imread(filename, 1);
if (img.empty()) {
std::cerr << "error: unable to decode image " << filename << std::endl;
exit(1);
}
cv::Mat sample;
if ((img.channels() == 3) && (img_channels == 3)) {
cv::cvtColor(img, sample, GET_TRANSFORMATION_CODE(BGR2RGB));
} else {
std::cerr << "unexpected number of channels " << img.channels()
<< " in input image, model expects " << img_channels << "."
<< std::endl;
exit(1);
}
cv::Mat sample_resized;
cv::resize(sample, sample_resized, img_size);
cv::Mat sample_type;
sample_resized.convertTo(sample_type, (img_channels == 3) ? img_type3 : img_type1);
  cv::Mat sample_final;
  // Standard ImageNet normalization: subtract the per-channel mean, then
  // scale by the reciprocal of the per-channel standard deviation.
  sample_final = sample_type - cv::Scalar(123.675, 116.28, 103.53);
  sample_final = sample_final.mul(cv::Scalar(1 / 58.395, 1 / 57.12, 1 / 57.375));
size_t img_byte_size = sample_final.total() * sample_final.elemSize();
size_t pos = 0;
input_data->resize(img_byte_size);
std::vector<cv::Mat> input_bgr_channels;
for (size_t i = 0; i < img_channels; ++i) {
input_bgr_channels.emplace_back(img_size.height, img_size.width, img_type1, &((*input_data)[pos]));
pos += input_bgr_channels.back().total() * input_bgr_channels.back().elemSize();
}
cv::split(sample_final, input_bgr_channels);
if (pos != img_byte_size) {
std::cerr << "unexpected total size of channels " << pos << ", expecting "
<< img_byte_size << std::endl;
exit(1);
}
}
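// Interpret the raw FP32 output as class logits, apply softmax, and print
// every label whose confidence is at least 0.5.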
void Postprocess(
const std::unique_ptr<tc::InferResult> result,
const std::vector<std::string>& filenames, const size_t batch_size,
const std::string& output_name, const bool batching)
{
if (!result->RequestStatus().IsOk()) {
std::cerr << "inference failed with error: " << result->RequestStatus()
<< std::endl;
exit(1);
}
if (filenames.size() != batch_size) {
std::cerr << "expected " << batch_size << " filenames, got "
<< filenames.size() << std::endl;
exit(1);
}
// Get and validate the shape and datatype
std::vector<int64_t> shape;
tc::Error err = result->Shape(output_name, &shape);
if (!err.IsOk()) {
std::cerr << "unable to get shape for " << output_name << std::endl;
exit(1);
}
std::string datatype;
err = result->Datatype(output_name, &datatype);
if (!err.IsOk()) {
std::cerr << "unable to get datatype for " << output_name << std::endl;
exit(1);
}
const uint8_t* result_data;
size_t outputCount = 0;
err = result->RawData(output_name, &result_data, &outputCount);
if (!err.IsOk()) {
std::cerr << "unable to get data for " << output_name << std::endl;
exit(1);
}
  // The raw buffer holds FP32 logits; read it in place instead of copying
  // into a fixed-size heap allocation.
  const size_t num_classes = outputCount / sizeof(float);
  const float* pdata = reinterpret_cast<const float*>(result_data);
  std::vector<float> logits(pdata, pdata + num_classes);
  std::vector<float> probs = ComputeSoftmax(logits);
  for (size_t j = 0; j < probs.size(); ++j) {
    if (probs[j] >= 0.5) {
      fprintf(stdout, "label:%zu,confidence:%.3f\n", j, probs[j]);
    }
  }
}
bool ParseType(const std::string& dtype, int* type1, int* type3)
{
if (dtype.compare("UINT8") == 0) {
*type1 = CV_8UC1;
*type3 = CV_8UC3;
} else if (dtype.compare("INT8") == 0) {
*type1 = CV_8SC1;
*type3 = CV_8SC3;
} else if (dtype.compare("UINT16") == 0) {
*type1 = CV_16UC1;
*type3 = CV_16UC3;
} else if (dtype.compare("INT16") == 0) {
*type1 = CV_16SC1;
*type3 = CV_16SC3;
} else if (dtype.compare("INT32") == 0) {
*type1 = CV_32SC1;
*type3 = CV_32SC3;
} else if (dtype.compare("FP32") == 0) {
*type1 = CV_32FC1;
*type3 = CV_32FC3;
} else if (dtype.compare("FP64") == 0) {
*type1 = CV_64FC1;
*type3 = CV_64FC3;
} else {
return false;
}
return true;
}
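// Extract the input/output names, datatype, layout, and dimensions this
// client needs from the model metadata and configuration returned over HTTP.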
void ParseModelHttp(
const rapidjson::Document& model_metadata,
const rapidjson::Document& model_config, const size_t batch_size,
ModelInfo* model_info)
{
const auto& input_itr = model_metadata.FindMember("inputs");
size_t input_count = 0;
if (input_itr != model_metadata.MemberEnd()) {
input_count = input_itr->value.Size();
}
if (input_count != 1) {
std::cerr << "expecting 1 input, got " << input_count << std::endl;
exit(1);
}
const auto& output_itr = model_metadata.FindMember("outputs");
size_t output_count = 0;
if (output_itr != model_metadata.MemberEnd()) {
output_count = output_itr->value.Size();
}
if (output_count != 1) {
std::cerr << "expecting 1 output, got " << output_count << std::endl;
exit(1);
}
const auto& input_config_itr = model_config.FindMember("input");
input_count = 0;
if (input_config_itr != model_config.MemberEnd()) {
input_count = input_config_itr->value.Size();
}
if (input_count != 1) {
std::cerr << "expecting 1 input in model configuration, got " << input_count
<< std::endl;
exit(1);
}
const auto& input_metadata = *input_itr->value.Begin();
const auto& input_config = *input_config_itr->value.Begin();
const auto& output_metadata = *output_itr->value.Begin();
const auto& output_dtype_itr = output_metadata.FindMember("datatype");
if (output_dtype_itr == output_metadata.MemberEnd()) {
std::cerr << "output missing datatype in the metadata for model'"
<< model_metadata["name"].GetString() << "'" << std::endl;
exit(1);
}
auto datatype = std::string(output_dtype_itr->value.GetString(),
output_dtype_itr->value.GetStringLength());
if (datatype.compare("FP32") != 0) {
std::cerr << "expecting output datatype to be FP32, model '"
<< model_metadata["name"].GetString() << "' output type is '"
<< datatype << "'" << std::endl;
exit(1);
}
int max_batch_size = 0;
const auto bs_itr = model_config.FindMember("max_batch_size");
if (bs_itr != model_config.MemberEnd()) {
max_batch_size = bs_itr->value.GetUint();
}
model_info->max_batch_size_ = max_batch_size;
if (max_batch_size == 0) {
if (batch_size != 1) {
std::cerr << "batching not supported for model '"
<< model_metadata["name"].GetString() << "'" << std::endl;
exit(1);
}
} else {
if (batch_size > (size_t)max_batch_size) {
std::cerr << "expecting batch size <= " << max_batch_size
<< " for model '" << model_metadata["name"].GetString() << "'"
<< std::endl;
exit(1);
}
}
const bool input_batch_dim = (max_batch_size == 0);
const size_t expected_input_dims = 3 + (input_batch_dim ? 1 : 0);
const auto input_shape_itr = input_metadata.FindMember("shape");
  model_info->input_format_ = std::string(
      input_config["format"].GetString(),
      input_config["format"].GetStringLength());
  model_info->output_name_ = std::string(
      output_metadata["name"].GetString(),
      output_metadata["name"].GetStringLength());
  model_info->input_name_ = std::string(
      input_metadata["name"].GetString(),
      input_metadata["name"].GetStringLength());
  model_info->input_datatype_ = std::string(
      input_metadata["datatype"].GetString(),
      input_metadata["datatype"].GetStringLength());
model_info->input_c_ = input_shape_itr->value[1].GetInt();
model_info->input_h_ = input_shape_itr->value[2].GetInt();
model_info->input_w_ = input_shape_itr->value[3].GetInt();
  if (!ParseType(
          model_info->input_datatype_, &(model_info->type1_),
          &(model_info->type3_))) {
    std::cerr << "unexpected input datatype '" << model_info->input_datatype_
              << "' for model '" << model_metadata["name"].GetString() << "'"
              << std::endl;
    exit(1);
  }
}
union TritonClient {
TritonClient()
{
new (&http_client_) std::unique_ptr<tc::InferenceServerHttpClient>{};
}
~TritonClient() {}
std::unique_ptr<tc::InferenceServerHttpClient> http_client_;
std::unique_ptr<tc::InferenceServerGrpcClient> grpc_client_;
};
}
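// Entry point for the image-classification client: query the model's
// metadata and configuration over HTTP, preprocess the input image(s) with
// OpenCV, send synchronous inference requests, and post-process the results.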
int
main(int argc, char** argv)
{
bool verbose = false;
bool async = false;
int batch_size = 1;
  if (argc != 3) {
    fprintf(stderr, "Usage: %s <model_name> <image_path>\n", argv[0]);
    return -1;
  }
std::string model_name = argv[1];
std::string fileName = argv[2];
std::string preprocess_output_filename;
std::string model_version = "";
std::string url("localhost:8000");
ProtocolType protocol = ProtocolType::HTTP;
tc::Headers http_headers;
TritonClient triton_client;
tc::Error err;
err = tc::InferenceServerHttpClient::Create(
&triton_client.http_client_, url, verbose);
if (!err.IsOk()) {
std::cerr << "error: unable to create client for inference: " << err << std::endl;
exit(1);
}
ModelInfo model_info;
  std::string model_metadata;
  err = triton_client.http_client_->ModelMetadata(
      &model_metadata, model_name, model_version, http_headers);
  if (!err.IsOk()) {
    std::cerr << "error: failed to get model metadata: " << err << std::endl;
    exit(1);
  }
  rapidjson::Document model_metadata_json;
  err = tc::ParseJson(&model_metadata_json, model_metadata);
  if (!err.IsOk()) {
    std::cerr << "error: failed to parse model metadata: " << err
              << std::endl;
    exit(1);
  }
  std::string model_config;
  err = triton_client.http_client_->ModelConfig(
      &model_config, model_name, model_version, http_headers);
  if (!err.IsOk()) {
    std::cerr << "error: failed to get model config: " << err << std::endl;
    exit(1);
  }
  rapidjson::Document model_config_json;
  err = tc::ParseJson(&model_config_json, model_config);
  if (!err.IsOk()) {
    std::cerr << "error: failed to parse model config: " << err << std::endl;
    exit(1);
  }
  ParseModelHttp(
      model_metadata_json, model_config_json, batch_size, &model_info);
std::vector<std::string> image_filenames;
struct stat name_stat;
if (stat(fileName.c_str(), &name_stat) != 0) {
std::cerr << "Failed to find '" << fileName << "': " << strerror(errno) << std::endl;
exit(1);
}
  if (S_ISDIR(name_stat.st_mode)) {
const std::string dirname = fileName;
DIR* dir_ptr = opendir(dirname.c_str());
struct dirent* d_ptr;
while ((d_ptr = readdir(dir_ptr)) != NULL) {
const std::string filename = d_ptr->d_name;
if ((filename != ".") && (filename != "..")) {
image_filenames.push_back(dirname + "/" + filename);
}
}
closedir(dir_ptr);
} else {
image_filenames.push_back(fileName);
}
std::sort(image_filenames.begin(), image_filenames.end());
std::vector<std::vector<uint8_t>> image_data;
for (const auto& fn : image_filenames) {
image_data.emplace_back();
Preprocess(fn, model_info.type1_, model_info.type3_, model_info.input_c_,
cv::Size(model_info.input_w_, model_info.input_h_), &(image_data.back()));
if ((image_data.size() == 1) && !preprocess_output_filename.empty()) {
std::ofstream output_file(preprocess_output_filename);
std::ostream_iterator<uint8_t> output_iterator(output_file);
std::copy(image_data[0].begin(), image_data[0].end(), output_iterator);
}
}
std::vector<int64_t> shape;
shape.push_back(batch_size);
shape.push_back(model_info.input_c_);
shape.push_back(model_info.input_h_);
shape.push_back(model_info.input_w_);
tc::InferInput* input;
err = tc::InferInput::Create(&input, model_info.input_name_, shape, model_info.input_datatype_);
if (!err.IsOk()) {
std::cerr << "unable to get input: " << err << std::endl;
exit(1);
}
std::shared_ptr<tc::InferInput> input_ptr(input);
tc::InferRequestedOutput* output;
err = tc::InferRequestedOutput::Create(&output, model_info.output_name_);
if (!err.IsOk()) {
std::cerr << "unable to get output: " << err << std::endl;
exit(1);
}
std::shared_ptr<tc::InferRequestedOutput> output_ptr(output);
std::vector<tc::InferInput*> inputs = {input_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {output_ptr.get()};
tc::InferOptions options(model_name);
options.model_version_ = model_version;
std::vector<std::unique_ptr<tc::InferResult>> results;
std::vector<std::vector<std::string>> result_filenames;
size_t image_idx = 0;
size_t done_cnt = 0;
size_t sent_count = 0;
bool last_request = false;
std::mutex mtx;
std::condition_variable cv;
auto callback_func = [&](tc::InferResult* result)
{
{
std::lock_guard<std::mutex> lk(mtx);
results.emplace_back(result);
done_cnt++;
}
cv.notify_all();
};
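  // NOTE: callback_func and the 'async' flag above are currently unused; the
  // loop below issues synchronous Infer() requests and stores each result
  // directly.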
while (!last_request) {
err = input_ptr->Reset();
if (!err.IsOk()) {
std::cerr << "failed resetting input: " << err << std::endl;
exit(1);
}
std::vector<std::string> input_filenames;
for (int idx = 0; idx < batch_size; ++idx) {
input_filenames.push_back(image_filenames[image_idx]);
err = input_ptr->AppendRaw(image_data[image_idx]);
if (!err.IsOk()) {
std::cerr << "failed setting input: " << err << std::endl;
exit(1);
}
image_idx = (image_idx + 1) % image_data.size();
if (image_idx == 0) {
last_request = true;
}
}
result_filenames.emplace_back(std::move(input_filenames));
options.request_id_ = std::to_string(sent_count);
    double time1 = cv::getTickCount();
tc::InferResult* result;
if (protocol == ProtocolType::HTTP) {
err = triton_client.http_client_->Infer(
&result, options, inputs, outputs, http_headers);
} else {
err = triton_client.grpc_client_->Infer(
&result, options, inputs, outputs, http_headers);
}
if (!err.IsOk()) {
std::cerr << "failed sending synchronous infer request: " << err
<< std::endl;
exit(1);
}
results.emplace_back(result);
    double time2 = cv::getTickCount();
    double elapsedTime = (time2 - time1) * 1000 / cv::getTickFrequency();
    fprintf(stdout, "inference time:%f ms\n", elapsedTime);
sent_count++;
}
for (size_t idx = 0; idx < results.size(); idx++) {
std::cout << "Request " << idx << ", batch size " << batch_size << std::endl;
Postprocess(
std::move(results[idx]), result_filenames[idx], batch_size,
model_info.output_name_, model_info.max_batch_size_ != 0);
}
return 0;
}
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#include <iostream>
#include <string>
#include "grpc_client.h"
#include "http_client.h"
#include "shm_utils.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
union TritonClient {
TritonClient()
{
new (&http_client_) std::unique_ptr<tc::InferenceServerHttpClient>{};
}
~TritonClient() {}
std::unique_ptr<tc::InferenceServerHttpClient> http_client_;
std::unique_ptr<tc::InferenceServerGrpcClient> grpc_client_;
};
void
ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
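// InferAndValidate runs one inference reusing the same InferInput and
// InferRequestedOutput objects, backed either by the registered system
// shared-memory regions or by freshly appended raw data, and then verifies
// that OUTPUT0 = INPUT0 + INPUT1 and OUTPUT1 = INPUT0 - INPUT1.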
void
InferAndValidate(
const bool use_shared_memory, TritonClient& triton_client,
const std::string& protocol, const tc::InferOptions& options,
const tc::Headers& http_headers, std::vector<tc::InferInput*>& inputs,
const size_t input_byte_size,
std::vector<tc::InferRequestedOutput*>& outputs,
const size_t output_byte_size, std::vector<int*>& shm_ptrs)
{
std::vector<int32_t> input0_data(16);
std::vector<int32_t> input1_data(16);
int32_t* input0_data_ptr;
int32_t* input1_data_ptr;
int32_t* output0_data_ptr;
int32_t* output1_data_ptr;
FAIL_IF_ERR(inputs[0]->Reset(), "unable to reset input 'INPUT0'");
FAIL_IF_ERR(inputs[1]->Reset(), "unable to reset input 'INPUT1'");
if (use_shared_memory) {
input0_data_ptr = shm_ptrs[0];
input1_data_ptr = shm_ptrs[1];
FAIL_IF_ERR(
inputs[0]->SetSharedMemory(
"input_data", input_byte_size, 0 /* offset */),
"unable to set shared memory for INPUT0");
FAIL_IF_ERR(
inputs[1]->SetSharedMemory(
"input_data", input_byte_size, input_byte_size /* offset */),
"unable to set shared memory for INPUT1");
FAIL_IF_ERR(
outputs[0]->SetSharedMemory(
"output_data", output_byte_size, 0 /* offset */),
"unable to set shared memory for 'OUTPUT0'");
FAIL_IF_ERR(
outputs[1]->SetSharedMemory(
"output_data", output_byte_size, output_byte_size /* offset */),
"unable to set shared memory for 'OUTPUT1'");
} else {
input0_data_ptr = &input0_data[0];
input1_data_ptr = &input1_data[0];
// Create the data for the two input tensors. Initialize the first
// to unique integers and the second to all twos. We use twos instead
// of ones in input1_data to validate whether inputs were set correctly.
for (size_t i = 0; i < 16; ++i) {
input0_data[i] = i;
input1_data[i] = 2;
}
FAIL_IF_ERR(
inputs[0]->AppendRaw(
reinterpret_cast<uint8_t*>(&input0_data[0]),
input0_data.size() * sizeof(int32_t)),
"unable to set data for 'INPUT0'");
FAIL_IF_ERR(
inputs[1]->AppendRaw(
reinterpret_cast<uint8_t*>(&input1_data[0]),
input1_data.size() * sizeof(int32_t)),
"unable to set data for 'INPUT1'");
FAIL_IF_ERR(
outputs[0]->UnsetSharedMemory(),
"unable to unset shared memory for 'OUTPUT0'");
FAIL_IF_ERR(
outputs[1]->UnsetSharedMemory(),
"unable to unset shared memory for 'OUTPUT1'");
}
std::vector<const tc::InferRequestedOutput*> routputs = {
outputs[0], outputs[1]};
tc::InferResult* results;
if (protocol == "http") {
FAIL_IF_ERR(
triton_client.http_client_->Infer(
&results, options, inputs, routputs, http_headers),
"unable to run model");
} else {
FAIL_IF_ERR(
triton_client.grpc_client_->Infer(
&results, options, inputs, routputs, http_headers),
"unable to run model");
}
std::shared_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Validate the results...
ValidateShapeAndDatatype("OUTPUT0", results_ptr);
ValidateShapeAndDatatype("OUTPUT1", results_ptr);
if (use_shared_memory) {
std::cout << "\n\n======== SHARED_MEMORY ========\n";
output0_data_ptr = shm_ptrs[2];
output1_data_ptr = shm_ptrs[3];
} else {
std::cout << "\n\n======== NO_SHARED_MEMORY ========\n";
// Get pointers to the result returned...
size_t recv_output0_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT0", (const uint8_t**)&output0_data_ptr,
&recv_output0_byte_size),
"unable to get result data for 'OUTPUT0'");
if (recv_output0_byte_size != output_byte_size) {
std::cerr << "error: received incorrect byte size for 'OUTPUT0': "
<< recv_output0_byte_size << std::endl;
exit(1);
}
size_t recv_output1_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT1", (const uint8_t**)&output1_data_ptr,
&recv_output1_byte_size),
"unable to get result data for 'OUTPUT1'");
if (recv_output1_byte_size != output_byte_size) {
std::cerr << "error: received incorrect byte size for 'OUTPUT1': "
<< recv_output1_byte_size << std::endl;
exit(1);
}
}
for (size_t i = 0; i < 16; ++i) {
std::cout << input0_data_ptr[i] << " + " << input1_data_ptr[i] << " = "
<< output0_data_ptr[i] << std::endl;
std::cout << input0_data_ptr[i] << " - " << input1_data_ptr[i] << " = "
<< output1_data_ptr[i] << std::endl;
if ((input0_data_ptr[i] + input1_data_ptr[i]) != output0_data_ptr[i]) {
std::cerr << "error: incorrect sum" << std::endl;
exit(1);
}
if ((input0_data_ptr[i] - input1_data_ptr[i]) != output1_data_ptr[i]) {
std::cerr << "error: incorrect difference" << std::endl;
exit(1);
}
}
std::cout << "\n======== END ========\n\n";
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
// Tests whether the same InferInput and InferRequestedOutput objects can be
// successfully used repeatedly for different inferences using/not-using
// shared memory.
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8000");
bool url_specified = false;
tc::Headers http_headers;
std::string protocol("http");
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vu:i:H:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
url_specified = true;
break;
case 'i':
protocol = optarg;
std::transform(
protocol.begin(), protocol.end(), protocol.begin(), ::tolower);
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create the inference client for the server using the requested protocol.
TritonClient triton_client;
tc::Error err;
if (protocol == "http") {
err = tc::InferenceServerHttpClient::Create(
&triton_client.http_client_, url, verbose);
} else if (protocol == "grpc") {
if (!url_specified) {
url = "localhost:8001";
}
err = tc::InferenceServerGrpcClient::Create(
&triton_client.grpc_client_, url, verbose);
} else {
std::cerr
<< "error: unsupported protocol provided: only supports grpc or http."
<< std::endl;
exit(1);
}
if (!err.IsOk()) {
std::cerr << "error: unable to create client for inference: " << err
<< std::endl;
exit(1);
}
// Unregistering all shared memory regions for a clean
// start.
if (protocol == "http") {
FAIL_IF_ERR(
triton_client.http_client_->UnregisterSystemSharedMemory(),
"unable to unregister all system shared memory regions");
FAIL_IF_ERR(
triton_client.http_client_->UnregisterCudaSharedMemory(),
"unable to unregister all cuda shared memory regions");
} else {
FAIL_IF_ERR(
triton_client.grpc_client_->UnregisterSystemSharedMemory(),
"unable to unregister all system shared memory regions");
FAIL_IF_ERR(
triton_client.grpc_client_->UnregisterCudaSharedMemory(),
"unable to unregister all cuda shared memory regions");
}
std::vector<int64_t> shape{1, 16};
size_t input_byte_size = 64;
size_t output_byte_size = 64;
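  // Each tensor holds 16 INT32 elements, i.e. 64 bytes.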
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "INPUT0", shape, "INT32"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
FAIL_IF_ERR(
tc::InferInput::Create(&input1, "INPUT1", shape, "INT32"),
"unable to get INPUT1");
std::shared_ptr<tc::InferInput> input1_ptr;
input1_ptr.reset(input1);
// Create Input0 and Input1 in Shared Memory. Initialize Input0 to unique
// integers and Input1 to all ones.
std::string shm_key = "/input_simple";
int shm_fd_ip, *input0_shm;
FAIL_IF_ERR(
tc::CreateSharedMemoryRegion(shm_key, input_byte_size * 2, &shm_fd_ip),
"");
FAIL_IF_ERR(
tc::MapSharedMemory(
shm_fd_ip, 0, input_byte_size * 2, (void**)&input0_shm),
"");
FAIL_IF_ERR(tc::CloseSharedMemory(shm_fd_ip), "");
int* input1_shm = (int*)(input0_shm + 16);
for (size_t i = 0; i < 16; ++i) {
*(input0_shm + i) = i;
*(input1_shm + i) = 1;
}
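  // Both inputs live in a single region ("/input_simple"); INPUT1 starts at
  // byte offset input_byte_size, matching the offsets passed to
  // SetSharedMemory() in InferAndValidate().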
if (protocol == "http") {
FAIL_IF_ERR(
triton_client.http_client_->RegisterSystemSharedMemory(
"input_data", "/input_simple", input_byte_size * 2),
"failed to register input shared memory region");
} else {
FAIL_IF_ERR(
triton_client.grpc_client_->RegisterSystemSharedMemory(
"input_data", "/input_simple", input_byte_size * 2),
"failed to register input shared memory region");
}
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
tc::InferRequestedOutput* output1;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "OUTPUT0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output1, "OUTPUT1"),
"unable to get 'OUTPUT1'");
std::shared_ptr<tc::InferRequestedOutput> output1_ptr;
output1_ptr.reset(output1);
// Create Output0 and Output1 in Shared Memory
shm_key = "/output_simple";
int shm_fd_op;
int* output0_shm;
FAIL_IF_ERR(
tc::CreateSharedMemoryRegion(shm_key, output_byte_size * 2, &shm_fd_op),
"");
FAIL_IF_ERR(
tc::MapSharedMemory(
shm_fd_op, 0, output_byte_size * 2, (void**)&output0_shm),
"");
FAIL_IF_ERR(tc::CloseSharedMemory(shm_fd_op), "");
int* output1_shm = (int*)(output0_shm + 16);
if (protocol == "http") {
FAIL_IF_ERR(
triton_client.http_client_->RegisterSystemSharedMemory(
"output_data", "/output_simple", output_byte_size * 2),
"failed to register output shared memory region");
} else {
FAIL_IF_ERR(
triton_client.grpc_client_->RegisterSystemSharedMemory(
"output_data", "/output_simple", output_byte_size * 2),
"failed to register output shared memory region");
}
std::vector<tc::InferInput*> inputs = {input0_ptr.get(), input1_ptr.get()};
std::vector<tc::InferRequestedOutput*> outputs = {
output0_ptr.get(), output1_ptr.get()};
std::vector<int*> shm_ptrs = {
input0_shm, input1_shm, output0_shm, output1_shm};
  // The inference settings; use the defaults for now.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
// Issue inference using shared memory
InferAndValidate(
true /* use_shared_memory */, triton_client, protocol, options,
http_headers, inputs, input_byte_size, outputs, output_byte_size,
shm_ptrs);
// Issue inference without using shared memory
InferAndValidate(
false /* use_shared_memory */, triton_client, protocol, options,
http_headers, inputs, input_byte_size, outputs, output_byte_size,
shm_ptrs);
// Issue inference using shared memory
InferAndValidate(
true /* use_shared_memory */, triton_client, protocol, options,
http_headers, inputs, input_byte_size, outputs, output_byte_size,
shm_ptrs);
// Unregister shared memory
if (protocol == "http") {
FAIL_IF_ERR(
triton_client.http_client_->UnregisterSystemSharedMemory("input_data"),
"unable to unregister shared memory input region");
FAIL_IF_ERR(
triton_client.http_client_->UnregisterSystemSharedMemory("output_data"),
"unable to unregister shared memory output region");
} else {
FAIL_IF_ERR(
triton_client.grpc_client_->UnregisterSystemSharedMemory("input_data"),
"unable to unregister shared memory input region");
FAIL_IF_ERR(
triton_client.grpc_client_->UnregisterSystemSharedMemory("output_data"),
"unable to unregister shared memory output region");
}
// Cleanup shared memory
FAIL_IF_ERR(tc::UnmapSharedMemory(input0_shm, input_byte_size * 2), "");
FAIL_IF_ERR(tc::UnlinkSharedMemoryRegion("/input_simple"), "");
FAIL_IF_ERR(tc::UnmapSharedMemory(output0_shm, output_byte_size * 2), "");
FAIL_IF_ERR(tc::UnlinkSharedMemoryRegion("/output_simple"), "");
return 0;
}
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <string>
#include "grpc_client.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
void
ValidateResult(
const std::shared_ptr<tc::InferResult> result,
std::vector<int32_t>& input0_data, std::vector<int32_t>& input1_data)
{
// Validate the results...
ValidateShapeAndDatatype("OUTPUT0", result);
ValidateShapeAndDatatype("OUTPUT1", result);
// Get pointers to the result returned...
int32_t* output0_data;
size_t output0_byte_size;
FAIL_IF_ERR(
result->RawData(
"OUTPUT0", (const uint8_t**)&output0_data, &output0_byte_size),
"unable to get result data for 'OUTPUT0'");
if (output0_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT0': "
<< output0_byte_size << std::endl;
exit(1);
}
int32_t* output1_data;
size_t output1_byte_size;
FAIL_IF_ERR(
result->RawData(
"OUTPUT1", (const uint8_t**)&output1_data, &output1_byte_size),
"unable to get result data for 'OUTPUT1'");
  if (output1_byte_size != 64) {
    std::cerr << "error: received incorrect byte size for 'OUTPUT1': "
              << output1_byte_size << std::endl;
exit(1);
}
for (size_t i = 0; i < 16; ++i) {
std::cout << input0_data[i] << " + " << input1_data[i] << " = "
<< *(output0_data + i) << std::endl;
std::cout << input0_data[i] << " - " << input1_data[i] << " = "
<< *(output1_data + i) << std::endl;
if ((input0_data[i] + input1_data[i]) != *(output0_data + i)) {
std::cerr << "error: incorrect sum" << std::endl;
exit(1);
}
if ((input0_data[i] - input1_data[i]) != *(output1_data + i)) {
std::cerr << "error: incorrect difference" << std::endl;
exit(1);
}
}
// Get full response
std::cout << result->DebugString() << std::endl;
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-t <client timeout in microseconds>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
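// main sends repeat_cnt asynchronous requests whose callbacks validate the
// results inline, then one more request whose completed result is deferred to
// the main thread, and finally prints the client-side inference statistics.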
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
uint32_t client_timeout = 0;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vu:t:H:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 't':
client_timeout = std::stoi(optarg);
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(&client, url, verbose),
"unable to create grpc client");
// Create the data for the two input tensors. Initialize the first
// to unique integers and the second to all ones.
std::vector<int32_t> input0_data(16);
std::vector<int32_t> input1_data(16);
for (size_t i = 0; i < 16; ++i) {
input0_data[i] = i;
input1_data[i] = 1;
}
std::vector<int64_t> shape{1, 16};
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "INPUT0", shape, "INT32"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
FAIL_IF_ERR(
tc::InferInput::Create(&input1, "INPUT1", shape, "INT32"),
"unable to get INPUT1");
std::shared_ptr<tc::InferInput> input1_ptr;
input1_ptr.reset(input1);
FAIL_IF_ERR(
input0_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input0_data[0]),
input0_data.size() * sizeof(int32_t)),
"unable to set data for INPUT0");
FAIL_IF_ERR(
input1_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input1_data[0]),
input1_data.size() * sizeof(int32_t)),
"unable to set data for INPUT1");
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
tc::InferRequestedOutput* output1;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "OUTPUT0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output1, "OUTPUT1"),
"unable to get 'OUTPUT1'");
std::shared_ptr<tc::InferRequestedOutput> output1_ptr;
output1_ptr.reset(output1);
  // The inference settings; use the defaults for now.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
options.client_timeout_ = client_timeout;
std::vector<tc::InferInput*> inputs = {input0_ptr.get(), input1_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {
output0_ptr.get(), output1_ptr.get()};
// Send inference request to the inference server.
std::mutex mtx;
std::condition_variable cv;
size_t repeat_cnt = 2;
size_t done_cnt = 0;
for (size_t i = 0; i < repeat_cnt; i++) {
FAIL_IF_ERR(
client->AsyncInfer(
[&, i](tc::InferResult* result) {
{
std::shared_ptr<tc::InferResult> result_ptr;
result_ptr.reset(result);
std::lock_guard<std::mutex> lk(mtx);
std::cout << "Callback no." << i << " is called" << std::endl;
done_cnt++;
if (result_ptr->RequestStatus().IsOk()) {
ValidateResult(result_ptr, input0_data, input1_data);
} else {
std::cerr << "error: Inference failed: "
<< result_ptr->RequestStatus() << std::endl;
exit(1);
}
}
cv.notify_all();
},
options, inputs, outputs, http_headers),
"unable to run model");
}
// Wait until all callbacks are invoked
{
std::unique_lock<std::mutex> lk(mtx);
    cv.wait(lk, [&]() { return done_cnt >= repeat_cnt; });
}
if (done_cnt == repeat_cnt) {
std::cout << "All done" << std::endl;
} else {
std::cerr << "Done cnt: " << done_cnt
<< " does not match repeat cnt: " << repeat_cnt << std::endl;
exit(1);
}
// Send another AsyncInfer whose callback defers the completed request
// to another thread (main thread) to handle
bool callback_invoked = false;
std::shared_ptr<tc::InferResult> result_placeholder;
FAIL_IF_ERR(
client->AsyncInfer(
[&](tc::InferResult* result) {
{
std::shared_ptr<tc::InferResult> result_ptr;
result_ptr.reset(result);
// Defer the response retrieval to main thread
std::lock_guard<std::mutex> lk(mtx);
callback_invoked = true;
result_placeholder = std::move(result_ptr);
}
cv.notify_all();
},
options, inputs, outputs, http_headers),
"unable to run model");
// Ensure callback is completed
{
std::unique_lock<std::mutex> lk(mtx);
cv.wait(lk, [&]() { return callback_invoked; });
}
// Get deferred response
std::cout << "Getting results from deferred response" << std::endl;
if (result_placeholder->RequestStatus().IsOk()) {
ValidateResult(result_placeholder, input0_data, input1_data);
} else {
std::cerr << "error: Inference failed: "
<< result_placeholder->RequestStatus() << std::endl;
exit(1);
}
tc::InferStat infer_stat;
client->ClientInferStat(&infer_stat);
std::cout << "completed_request_count " << infer_stat.completed_request_count
<< std::endl;
std::cout << "cumulative_total_request_time_ns "
<< infer_stat.cumulative_total_request_time_ns << std::endl;
std::cout << "cumulative_send_time_ns " << infer_stat.cumulative_send_time_ns
<< std::endl;
std::cout << "cumulative_receive_time_ns "
<< infer_stat.cumulative_receive_time_ns << std::endl;
std::cout << "PASS : Async Infer" << std::endl;
return 0;
}
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cuda_runtime_api.h>
#include <unistd.h>
#include <iostream>
#include <string>
#include "grpc_client.h"
#include "shm_utils.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
#define FAIL_IF_CUDA_ERR(FUNC) \
{ \
const cudaError_t result = FUNC; \
if (result != cudaSuccess) { \
std::cerr << "CUDA exception (line " << __LINE__ \
<< "): " << cudaGetErrorName(result) << " (" \
<< cudaGetErrorString(result) << ")" << std::endl; \
exit(1); \
} \
}
void
CreateCUDAIPCHandle(
cudaIpcMemHandle_t* cuda_handle, void* input_d_ptr, int device_id = 0)
{
// Set the GPU device to the desired GPU
FAIL_IF_CUDA_ERR(cudaSetDevice(device_id));
// Create IPC handle for data on the gpu
FAIL_IF_CUDA_ERR(cudaIpcGetMemHandle(cuda_handle, input_d_ptr));
}
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vu:H:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(&client, url, verbose),
"unable to create grpc client");
// Unregistering all shared memory regions for a clean
// start.
FAIL_IF_ERR(
client->UnregisterSystemSharedMemory(),
"unable to unregister all system shared memory regions");
FAIL_IF_ERR(
client->UnregisterCudaSharedMemory(),
"unable to unregister all cuda shared memory regions");
std::vector<int64_t> shape{1, 16};
size_t input_byte_size = 64;
size_t output_byte_size = 64;
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "INPUT0", shape, "INT32"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
FAIL_IF_ERR(
tc::InferInput::Create(&input1, "INPUT1", shape, "INT32"),
"unable to get INPUT1");
std::shared_ptr<tc::InferInput> input1_ptr;
input1_ptr.reset(input1);
// Create Input0 and Input1 in CUDA Shared Memory. Initialize Input0 to
// unique integers and Input1 to all ones.
int input_data[32];
for (size_t i = 0; i < 16; ++i) {
input_data[i] = i;
input_data[16 + i] = 1;
}
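  // INPUT0 occupies the first 16 ints of the host buffer and INPUT1 the next
  // 16, matching the byte offsets used with SetSharedMemory() below.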
// copy INPUT0 and INPUT1 data in GPU shared memory
int* input_d_ptr;
  FAIL_IF_CUDA_ERR(cudaMalloc((void**)&input_d_ptr, input_byte_size * 2));
  FAIL_IF_CUDA_ERR(cudaMemcpy(
      (void*)input_d_ptr, (void*)input_data, input_byte_size * 2,
      cudaMemcpyHostToDevice));
cudaIpcMemHandle_t input_cuda_handle;
CreateCUDAIPCHandle(&input_cuda_handle, (void*)input_d_ptr);
FAIL_IF_ERR(
client->RegisterCudaSharedMemory(
"input_data", input_cuda_handle, 0 /* device_id */,
input_byte_size * 2),
"failed to register input shared memory region");
FAIL_IF_ERR(
input0_ptr->SetSharedMemory(
"input_data", input_byte_size, 0 /* offset */),
"unable to set shared memory for INPUT0");
FAIL_IF_ERR(
input1_ptr->SetSharedMemory(
"input_data", input_byte_size, input_byte_size /* offset */),
"unable to set shared memory for INPUT1");
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
tc::InferRequestedOutput* output1;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "OUTPUT0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output1, "OUTPUT1"),
"unable to get 'OUTPUT1'");
std::shared_ptr<tc::InferRequestedOutput> output1_ptr;
output1_ptr.reset(output1);
// Create Output0 and Output1 in CUDA Shared Memory
int *output0_d_ptr, *output1_d_ptr;
  FAIL_IF_CUDA_ERR(cudaMalloc((void**)&output0_d_ptr, output_byte_size * 2));
output1_d_ptr = (int*)output0_d_ptr + 16;
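  // OUTPUT1 occupies the second half of the device allocation, i.e. byte
  // offset output_byte_size.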
cudaIpcMemHandle_t output_cuda_handle;
CreateCUDAIPCHandle(&output_cuda_handle, (void*)output0_d_ptr);
FAIL_IF_ERR(
client->RegisterCudaSharedMemory(
"output_data", output_cuda_handle, 0 /* device_id */,
output_byte_size * 2),
"failed to register output shared memory region");
FAIL_IF_ERR(
output0_ptr->SetSharedMemory(
"output_data", output_byte_size, 0 /* offset */),
"unable to set shared memory for 'OUTPUT0'");
FAIL_IF_ERR(
output1_ptr->SetSharedMemory(
"output_data", output_byte_size, output_byte_size /* offset */),
"unable to set shared memory for 'OUTPUT1'");
  // The inference settings; use the defaults for now.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
std::vector<tc::InferInput*> inputs = {input0_ptr.get(), input1_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {
output0_ptr.get(), output1_ptr.get()};
tc::InferResult* results;
FAIL_IF_ERR(
client->Infer(&results, options, inputs, outputs, http_headers),
"unable to run model");
std::shared_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Validate the results...
ValidateShapeAndDatatype("OUTPUT0", results_ptr);
ValidateShapeAndDatatype("OUTPUT1", results_ptr);
// Copy input and output data back to the CPU
int output0_data[16], output1_data[16];
  FAIL_IF_CUDA_ERR(cudaMemcpy(
      output0_data, output0_d_ptr, output_byte_size, cudaMemcpyDeviceToHost));
  FAIL_IF_CUDA_ERR(cudaMemcpy(
      output1_data, output1_d_ptr, output_byte_size, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < 16; ++i) {
std::cout << input_data[i] << " + " << input_data[16 + i] << " = "
<< output0_data[i] << std::endl;
    std::cout << input_data[i] << " - " << input_data[16 + i] << " = "
<< output1_data[i] << std::endl;
if ((input_data[i] + input_data[16 + i]) != output0_data[i]) {
std::cerr << "error: incorrect sum" << std::endl;
exit(1);
}
if ((input_data[i] - input_data[16 + i]) != output1_data[i]) {
std::cerr << "error: incorrect difference" << std::endl;
exit(1);
}
}
// Get shared memory regions active/registered within triton
inference::CudaSharedMemoryStatusResponse status;
FAIL_IF_ERR(
client->CudaSharedMemoryStatus(&status),
"failed to get shared memory status");
std::cout << "Shared Memory Status:\n" << status.DebugString() << "\n";
// Unregister shared memory
FAIL_IF_ERR(
client->UnregisterCudaSharedMemory("input_data"),
"unable to unregister shared memory input region");
FAIL_IF_ERR(
client->UnregisterCudaSharedMemory("output_data"),
"unable to unregister shared memory output region");
// Free GPU memory
FAIL_IF_CUDA_ERR(cudaFree(input_d_ptr));
FAIL_IF_CUDA_ERR(cudaFree(output0_d_ptr));
std::cout << "PASS : Cuda Shared Memory " << std::endl;
return 0;
}
// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <getopt.h>
#include <unistd.h>
#include <iostream>
#include <string>
#include "grpc_client.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-m <model name>" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-t <client timeout in microseconds>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
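// main configures custom grpc::ChannelArguments (message-size limits,
// keepalive settings), creates the gRPC client with them, and runs a
// synchronous inference against the "simple" model.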
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
uint32_t client_timeout = 0;
bool use_ssl = false;
tc::SslOptions ssl_options;
grpc::ChannelArguments channel_args;
// Set any valid grpc::ChannelArguments here based on use case
channel_args.SetMaxSendMessageSize(1024 * 1024);
channel_args.SetMaxReceiveMessageSize(1024 * 1024);
// Setting KeepAlive options using new generic channel arguments option
// https://grpc.github.io/grpc/cpp/md_doc_keepalive.html
channel_args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, INT_MAX);
channel_args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 20000);
channel_args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, false);
channel_args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 2);
// Example arg requested for the feature
channel_args.SetInt(GRPC_ARG_DNS_ENABLE_SRV_QUERIES, 1);
// Parse commandline...
int opt;
while ((opt = getopt_long(argc, argv, "vu:t:H:C:", NULL, NULL)) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 't':
client_timeout = std::stoi(optarg);
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(
&client, url, channel_args, verbose, use_ssl, ssl_options),
"unable to create grpc client");
// Create the data for the two input tensors. Initialize the first
// to unique integers and the second to all ones.
std::vector<int32_t> input0_data(16);
std::vector<int32_t> input1_data(16);
for (size_t i = 0; i < 16; ++i) {
input0_data[i] = i;
input1_data[i] = 1;
}
std::vector<int64_t> shape{1, 16};
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "INPUT0", shape, "INT32"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
FAIL_IF_ERR(
tc::InferInput::Create(&input1, "INPUT1", shape, "INT32"),
"unable to get INPUT1");
std::shared_ptr<tc::InferInput> input1_ptr;
input1_ptr.reset(input1);
FAIL_IF_ERR(
input0_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input0_data[0]),
input0_data.size() * sizeof(int32_t)),
"unable to set data for INPUT0");
FAIL_IF_ERR(
input1_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input1_data[0]),
input1_data.size() * sizeof(int32_t)),
"unable to set data for INPUT1");
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
tc::InferRequestedOutput* output1;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "OUTPUT0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output1, "OUTPUT1"),
"unable to get 'OUTPUT1'");
std::shared_ptr<tc::InferRequestedOutput> output1_ptr;
output1_ptr.reset(output1);
  // The inference settings; use the defaults for now.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
options.client_timeout_ = client_timeout;
std::vector<tc::InferInput*> inputs = {input0_ptr.get(), input1_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {
output0_ptr.get(), output1_ptr.get()};
tc::InferResult* results;
FAIL_IF_ERR(
client->Infer(&results, options, inputs, outputs, http_headers),
"unable to run model");
std::shared_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Validate the results...
ValidateShapeAndDatatype("OUTPUT0", results_ptr);
ValidateShapeAndDatatype("OUTPUT1", results_ptr);
// Get pointers to the result returned...
int32_t* output0_data;
size_t output0_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT0", (const uint8_t**)&output0_data, &output0_byte_size),
"unable to get result data for 'OUTPUT0'");
if (output0_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT0': "
<< output0_byte_size << std::endl;
exit(1);
}
int32_t* output1_data;
size_t output1_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT1", (const uint8_t**)&output1_data, &output1_byte_size),
"unable to get result data for 'OUTPUT1'");
if (output1_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT1': "
<< output1_byte_size << std::endl;
exit(1);
}
for (size_t i = 0; i < 16; ++i) {
std::cout << input0_data[i] << " + " << input1_data[i] << " = "
<< *(output0_data + i) << std::endl;
std::cout << input0_data[i] << " - " << input1_data[i] << " = "
<< *(output1_data + i) << std::endl;
if ((input0_data[i] + input1_data[i]) != *(output0_data + i)) {
std::cerr << "error: incorrect sum" << std::endl;
exit(1);
}
if ((input0_data[i] - input1_data[i]) != *(output1_data + i)) {
std::cerr << "error: incorrect difference" << std::endl;
exit(1);
}
}
// Get full response
std::cout << results_ptr->DebugString() << std::endl;
std::cout << "PASS : CustomArgs" << std::endl;
return 0;
}
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#include <atomic>
#include <condition_variable>
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "grpc_client.h"
namespace tc = triton::client;
using ResultMap =
std::map<std::string, std::vector<std::shared_ptr<tc::InferResult>>>;
using ResultList = std::vector<std::shared_ptr<tc::InferResult>>;
// Global mutex to synchronize the threads
std::mutex mutex_;
std::condition_variable cv_;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-u <URL for inference service and its gRPC port>"
<< std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
std::cerr << "\t-r <the number of inference requests>" << std::endl;
std::cerr << "\t-s <the number of inference response to generate per request>"
<< std::endl;
std::cerr << "\t-o <data offset>" << std::endl;
std::cerr << "\t-d <delay time between each response>" << std::endl;
std::cerr << "\t-w <wait time before releasing the request>" << std::endl;
exit(1);
}
} // namespace
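// main streams request_count requests to the decoupled "repeat_int32" model
// and verifies that each request produces repeat_count responses with the
// expected values.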
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
int request_count = 1;
int repeat_count = 1;
int data_offset = 100;
uint32_t delay_time = 1000;
uint32_t wait_time = 500;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vu:H:r:s:o:d:w:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case 'r':
request_count = std::stoi(optarg);
break;
case 's':
repeat_count = std::stoi(optarg);
break;
case 'o':
data_offset = std::stoi(optarg);
break;
case 'd':
delay_time = std::stoi(optarg);
break;
case 'w':
wait_time = std::stoi(optarg);
break;
case '?':
Usage(argv);
break;
}
}
tc::Error err;
// We use the custom "repeat_int32" model which takes 3 inputs and
// 1 output. For a single request the model will generate 'repeat_count'
  // responses. See src/backends/backend/examples/repeat.cc.
std::string model_name = "repeat_int32";
std::atomic<int32_t> received_response(0);
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(&client, url, verbose),
"unable to create grpc client");
ResultMap result_map;
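  // Responses are grouped by request id; a single decoupled request may
  // produce multiple responses.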
  // Note that client-side statistics should be disabled in the case of a
  // decoupled model.
FAIL_IF_ERR(
client->StartStream(
[&](tc::InferResult* result) {
{
std::shared_ptr<tc::InferResult> result_ptr(result);
std::lock_guard<std::mutex> lk(mutex_);
std::string request_id;
result->Id(&request_id);
auto it = result_map.find(request_id);
if (it == result_map.end()) {
result_map[request_id] = ResultList();
}
result_map[request_id].push_back(result_ptr);
received_response++;
}
cv_.notify_all();
},
false /*enable_stats*/, 0 /* stream_timeout */, http_headers),
"unable to establish a streaming connection to server");
// Prepare the data for the tensors
std::vector<int32_t> in_data;
std::vector<uint32_t> delay_data;
std::vector<uint32_t> wait_data;
for (int i = 0; i < repeat_count; i++) {
in_data.push_back(data_offset + i);
delay_data.push_back(delay_time);
}
wait_data.push_back(wait_time);
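  // Per the option descriptions in Usage(): IN carries 'repeat_count' values
  // starting at 'data_offset', DELAY carries the delay between responses, and
  // WAIT (a single value) the wait time before the request is released.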
// Initialize the inputs with the data.
tc::InferInput* in;
std::vector<int64_t> shape{repeat_count};
FAIL_IF_ERR(
tc::InferInput::Create(&in, "IN", shape, "INT32"),
"unable to create 'IN'");
std::shared_ptr<tc::InferInput> in_ptr(in);
FAIL_IF_ERR(in_ptr->Reset(), "unable to reset 'IN'");
FAIL_IF_ERR(
in_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&in_data[0]),
sizeof(int32_t) * repeat_count),
"unable to set data for 'IN'");
tc::InferInput* delay;
FAIL_IF_ERR(
tc::InferInput::Create(&delay, "DELAY", shape, "UINT32"),
"unable to create 'DELAY'");
std::shared_ptr<tc::InferInput> delay_ptr(delay);
FAIL_IF_ERR(delay_ptr->Reset(), "unable to reset 'DELAY'");
FAIL_IF_ERR(
delay_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&delay_data[0]),
sizeof(uint32_t) * repeat_count),
"unable to set data for 'DELAY'");
tc::InferInput* wait;
shape[0] = 1;
FAIL_IF_ERR(
tc::InferInput::Create(&wait, "WAIT", shape, "UINT32"),
"unable to create 'WAIT'");
std::shared_ptr<tc::InferInput> wait_ptr(wait);
FAIL_IF_ERR(wait_ptr->Reset(), "unable to reset 'WAIT'");
FAIL_IF_ERR(
wait_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&wait_data[0]), sizeof(uint32_t)),
"unable to set data for 'WAIT'");
std::vector<tc::InferInput*> inputs = {
in_ptr.get(), delay_ptr.get(), wait_ptr.get()};
tc::InferOptions options(model_name);
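  // All requests go over the same stream; each gets a unique request_id_ so
  // the stream callback can group the responses it receives per request.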
for (int id = 0; id < request_count; id++) {
options.request_id_ = std::to_string(id);
// Send inference request to the inference server.
FAIL_IF_ERR(
client->AsyncStreamInfer(options, inputs), "unable to run model");
}
// Wait until all callbacks are invoked
{
std::unique_lock<std::mutex> lk(mutex_);
    cv_.wait(lk, [&]() {
      return received_response >= (repeat_count * request_count);
    });
}
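  // Verify the collected responses: with repeat_count == 0 no request id
  // should appear in the map; otherwise each request id must have exactly
  // 'repeat_count' responses carrying data_offset, data_offset + 1, ...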
for (int i = 0; i < request_count; i++) {
std::string id(std::to_string(i));
if (repeat_count == 0) {
auto it = result_map.find(id);
if (it != result_map.end()) {
std::cerr << "received unexpected response for request id " << id
<< std::endl;
exit(1);
}
} else {
int32_t expected_output = data_offset;
auto it = result_map.find(id);
if (it == result_map.end()) {
std::cerr << "response for request id " << id << " not received"
<< std::endl;
exit(1);
}
if (it->second.size() != (uint32_t)repeat_count) {
std::cerr << "expected " << repeat_count << " many responses, got "
<< it->second.size() << std::endl;
exit(1);
}
for (auto this_result : it->second) {
int32_t* output_data;
size_t output_byte_size;
FAIL_IF_ERR(
this_result->RawData(
"OUT", (const uint8_t**)&output_data, &output_byte_size),
"unable to get result data for 'OUT'");
if (output_byte_size != 4) {
std::cerr << "error: received incorrect byte size for 'OUT': "
<< output_byte_size << std::endl;
exit(1);
}
if (*output_data != expected_output) {
std::cerr << "error: incorrect result returned, expected "
<< expected_output << ", got " << *output_data << std::endl;
exit(1);
}
expected_output++;
}
}
}
return 0;
}
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#include <iostream>
#include <string>
#include "grpc_client.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
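// Example invocation (the binary name depends on how this example is built;
// "<health_client>" is a placeholder), assuming a Triton server with the
// "simple" model reachable over gRPC on localhost:8001:
//   <health_client> -v -u localhost:8001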
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "vu:H:")) != -1) {
switch (opt) {
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(&client, url, verbose),
"unable to create grpc client");
bool live;
FAIL_IF_ERR(
client->IsServerLive(&live, http_headers),
"unable to get server liveness");
if (!live) {
std::cerr << "error: server is not live" << std::endl;
exit(1);
}
bool ready;
FAIL_IF_ERR(
client->IsServerReady(&ready, http_headers),
"unable to get server readiness");
if (!ready) {
std::cerr << "error: server is not live" << std::endl;
exit(1);
}
bool model_ready;
FAIL_IF_ERR(
client->IsModelReady(
&model_ready, model_name, model_version, http_headers),
"unable to get model readiness");
if (!model_ready) {
std::cerr << "error: model " << model_name << " is not live" << std::endl;
exit(1);
}
inference::ServerMetadataResponse server_metadata;
FAIL_IF_ERR(
client->ServerMetadata(&server_metadata, http_headers),
"unable to get server metadata");
if (server_metadata.name().compare("triton") != 0) {
std::cerr << "error: unexpected server metadata: "
<< server_metadata.DebugString() << std::endl;
exit(1);
}
inference::ModelMetadataResponse model_metadata;
FAIL_IF_ERR(
client->ModelMetadata(
&model_metadata, model_name, model_version, http_headers),
"unable to get model metadata");
if (model_metadata.name().compare(model_name) != 0) {
std::cerr << "error: unexpected model metadata: "
<< model_metadata.DebugString() << std::endl;
exit(1);
}
inference::ModelConfigResponse model_config;
FAIL_IF_ERR(
client->ModelConfig(
&model_config, model_name, model_version, http_headers),
"unable to get model config");
if (model_config.config().name().compare(model_name) != 0) {
std::cerr << "error: unexpected model config: "
<< model_config.DebugString() << std::endl;
exit(1);
}
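  // Negative test: asking for metadata of a model name that should not exist
  // is expected to fail, so a returned OK status is treated as an error.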
tc::Error err = client->ModelMetadata(
&model_metadata, "wrong_model_name", model_version, http_headers);
if (err.IsOk()) {
std::cerr << "error: expected an error but got: " << err << std::endl;
exit(1);
}
std::cout << err << std::endl;
return 0;
}
// Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <getopt.h>
#include <unistd.h>
#include <iostream>
#include <string>
#include "grpc_client.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-m <model name>" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-t <client timeout in microseconds>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr
<< "\tFor -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
std::cerr << "\t-C <grpc compression algorithm>. \'deflate\', "
"\'gzip\' and \'none\' are supported"
<< std::endl;
std::cerr << "\t-c <use_cached_channel>. "
" Use cached channel when creating new client. "
" Specify 'true' or 'false'. True by default"
<< std::endl;
std::cerr << std::endl;
exit(1);
}
} // namespace
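// Example invocations (the binary name depends on how this example is built;
// "<simple_client>" and the certificate paths are placeholders):
//   <simple_client> -u localhost:8001 -C gzip
//   <simple_client> --ssl --root-certificates ca.pem \
//       --private-key client.key --certificate-chain client.pem
//   <simple_client> -c false   # run the flow twice with use_cached_channel=false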
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
uint32_t client_timeout = 0;
bool use_ssl = false;
std::string root_certificates;
std::string private_key;
std::string certificate_chain;
grpc_compression_algorithm compression_algorithm =
grpc_compression_algorithm::GRPC_COMPRESS_NONE;
bool test_use_cached_channel = false;
bool use_cached_channel = true;
// {name, has_arg, *flag, val}
static struct option long_options[] = {
{"ssl", 0, 0, 0},
{"root-certificates", 1, 0, 1},
{"private-key", 1, 0, 2},
{"certificate-chain", 1, 0, 3}};
// Parse commandline...
int opt;
while ((opt = getopt_long(argc, argv, "vu:t:H:C:c:", long_options, NULL)) !=
-1) {
switch (opt) {
case 0:
use_ssl = true;
break;
case 1:
root_certificates = optarg;
break;
case 2:
private_key = optarg;
break;
case 3:
certificate_chain = optarg;
break;
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 't':
client_timeout = std::stoi(optarg);
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
if (header.size() == arg.size() || header.empty()) {
Usage(
argv,
"HTTP header specified incorrectly. Must be formmated as "
"'Header:Value'");
} else {
http_headers[header] = arg.substr(header.size() + 1);
}
break;
}
case 'C': {
std::string algorithm_str{optarg};
if (algorithm_str.compare("deflate") == 0) {
compression_algorithm =
grpc_compression_algorithm::GRPC_COMPRESS_DEFLATE;
} else if (algorithm_str.compare("gzip") == 0) {
compression_algorithm =
grpc_compression_algorithm::GRPC_COMPRESS_GZIP;
} else if (algorithm_str.compare("none") == 0) {
compression_algorithm =
grpc_compression_algorithm::GRPC_COMPRESS_NONE;
} else {
Usage(
argv,
"unsupported compression algorithm specified... only "
"\'deflate\', "
"\'gzip\' and \'none\' are supported.");
}
break;
}
case 'c': {
test_use_cached_channel = true;
std::string arg = optarg;
if (arg.find("false") != std::string::npos) {
use_cached_channel = false;
} else if (arg.find("true") != std::string::npos) {
use_cached_channel = true;
} else {
Usage(argv, "need to specify true or false for use_cached_channel");
}
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
tc::SslOptions ssl_options = tc::SslOptions();
std::string err;
if (use_ssl) {
ssl_options.root_certificates = root_certificates;
ssl_options.private_key = private_key;
ssl_options.certificate_chain = certificate_chain;
err = "unable to create secure grpc client";
} else {
err = "unable to create grpc client";
}
  // When -c is given, run the whole flow twice with the same URL so that the
  // second Create() exercises the requested use_cached_channel setting.
int numRuns = test_use_cached_channel ? 2 : 1;
for (int i = 0; i < numRuns; ++i) {
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(
&client, url, verbose, use_ssl, ssl_options, tc::KeepAliveOptions(),
use_cached_channel),
err);
// Create the data for the two input tensors. Initialize the first
// to unique integers and the second to all ones.
std::vector<int32_t> input0_data(16);
std::vector<int32_t> input1_data(16);
for (size_t i = 0; i < 16; ++i) {
input0_data[i] = i;
input1_data[i] = 1;
}
std::vector<int64_t> shape{1, 16};
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "INPUT0", shape, "INT32"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
FAIL_IF_ERR(
tc::InferInput::Create(&input1, "INPUT1", shape, "INT32"),
"unable to get INPUT1");
std::shared_ptr<tc::InferInput> input1_ptr;
input1_ptr.reset(input1);
FAIL_IF_ERR(
input0_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input0_data[0]),
input0_data.size() * sizeof(int32_t)),
"unable to set data for INPUT0");
FAIL_IF_ERR(
input1_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input1_data[0]),
input1_data.size() * sizeof(int32_t)),
"unable to set data for INPUT1");
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
tc::InferRequestedOutput* output1;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "OUTPUT0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output1, "OUTPUT1"),
"unable to get 'OUTPUT1'");
std::shared_ptr<tc::InferRequestedOutput> output1_ptr;
output1_ptr.reset(output1);
    // The inference settings. Only the model version and client timeout are
    // set explicitly; everything else uses the defaults.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
options.client_timeout_ = client_timeout;
std::vector<tc::InferInput*> inputs = {input0_ptr.get(), input1_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {
output0_ptr.get(), output1_ptr.get()};
tc::InferResult* results;
FAIL_IF_ERR(
client->Infer(
&results, options, inputs, outputs, http_headers,
compression_algorithm),
"unable to run model");
std::shared_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Validate the results...
ValidateShapeAndDatatype("OUTPUT0", results_ptr);
ValidateShapeAndDatatype("OUTPUT1", results_ptr);
// Get pointers to the result returned...
int32_t* output0_data;
size_t output0_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT0", (const uint8_t**)&output0_data, &output0_byte_size),
"unable to get result data for 'OUTPUT0'");
if (output0_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT0': "
<< output0_byte_size << std::endl;
exit(1);
}
int32_t* output1_data;
size_t output1_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT1", (const uint8_t**)&output1_data, &output1_byte_size),
"unable to get result data for 'OUTPUT1'");
if (output1_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT1': "
<< output1_byte_size << std::endl;
exit(1);
}
for (size_t i = 0; i < 16; ++i) {
std::cout << input0_data[i] << " + " << input1_data[i] << " = "
<< *(output0_data + i) << std::endl;
std::cout << input0_data[i] << " - " << input1_data[i] << " = "
<< *(output1_data + i) << std::endl;
if ((input0_data[i] + input1_data[i]) != *(output0_data + i)) {
std::cerr << "error: incorrect sum" << std::endl;
exit(1);
}
if ((input0_data[i] - input1_data[i]) != *(output1_data + i)) {
std::cerr << "error: incorrect difference" << std::endl;
exit(1);
}
}
// Get full response
std::cout << results_ptr->DebugString() << std::endl;
tc::InferStat infer_stat;
client->ClientInferStat(&infer_stat);
std::cout << "======Client Statistics======" << std::endl;
std::cout << "completed_request_count "
<< infer_stat.completed_request_count << std::endl;
std::cout << "cumulative_total_request_time_ns "
<< infer_stat.cumulative_total_request_time_ns << std::endl;
std::cout << "cumulative_send_time_ns "
<< infer_stat.cumulative_send_time_ns << std::endl;
std::cout << "cumulative_receive_time_ns "
<< infer_stat.cumulative_receive_time_ns << std::endl;
inference::ModelStatisticsResponse model_stat;
client->ModelInferenceStatistics(&model_stat, model_name);
std::cout << "======Model Statistics======" << std::endl;
std::cout << model_stat.DebugString() << std::endl;
std::cout << "PASS : Infer" << std::endl;
}
return 0;
}
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <getopt.h>
#include <unistd.h>
#include <iostream>
#include <string>
#include "grpc_client.h"
namespace tc = triton::client;
#define FAIL_IF_ERR(X, MSG) \
{ \
tc::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
namespace {
void
ValidateShapeAndDatatype(
const std::string& name, std::shared_ptr<tc::InferResult> result)
{
std::vector<int64_t> shape;
FAIL_IF_ERR(
result->Shape(name, &shape), "unable to get shape for '" + name + "'");
// Validate shape
if ((shape.size() != 2) || (shape[0] != 1) || (shape[1] != 16)) {
std::cerr << "error: received incorrect shapes for '" << name << "'"
<< std::endl;
exit(1);
}
std::string datatype;
FAIL_IF_ERR(
result->Datatype(name, &datatype),
"unable to get datatype for '" + name + "'");
// Validate datatype
if (datatype.compare("INT32") != 0) {
std::cerr << "error: received incorrect datatype for '" << name
<< "': " << datatype << std::endl;
exit(1);
}
}
void
Usage(char** argv, const std::string& msg = std::string())
{
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: " << argv[0] << " [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-m <model name>" << std::endl;
std::cerr << "\t-u <URL for inference service>" << std::endl;
std::cerr << "\t-t <client timeout in microseconds>" << std::endl;
std::cerr << "\t-H <HTTP header>" << std::endl;
std::cerr << "\t--grpc-keepalive-time <milliseconds>" << std::endl;
std::cerr << "\t--grpc-keepalive-timeout <milliseconds>" << std::endl;
std::cerr << "\t--grpc-keepalive-permit-without-calls" << std::endl;
std::cerr << "\t--grpc-http2-max-pings-without-data <number of pings>"
<< std::endl;
std::cerr << std::endl;
std::cerr
<< "For -H, header must be 'Header:Value'. May be given multiple times."
<< std::endl;
exit(1);
}
} // namespace
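// Example invocation (the binary name depends on how this example is built;
// "<keepalive_client>" is a placeholder):
//   <keepalive_client> -u localhost:8001 --grpc-keepalive-time 10000 \
//       --grpc-keepalive-timeout 20000 --grpc-keepalive-permit-without-calls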
int
main(int argc, char** argv)
{
bool verbose = false;
std::string url("localhost:8001");
tc::Headers http_headers;
uint32_t client_timeout = 0;
bool use_ssl = false;
tc::SslOptions ssl_options;
tc::KeepAliveOptions keepalive_options;
// GRPC KeepAlive: https://grpc.github.io/grpc/cpp/md_doc_keepalive.html
int keepalive_time_ms = INT_MAX;
int keepalive_timeout_ms = 20000;
bool keepalive_permit_without_calls = false;
int http2_max_pings_without_data = 2;
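  // The locals above record the gRPC keepalive defaults (per the link above)
  // for reference; the command-line flags below write directly into
  // keepalive_options, so these values are not used elsewhere.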
// {name, has_arg, *flag, val}
static struct option long_options[] = {
{"grpc-keepalive-time", 1, 0, 0},
{"grpc-keepalive-timeout", 1, 0, 1},
{"grpc-keepalive-permit-without-calls", 0, 0, 2},
{"grpc-http2-max-pings-without-data", 1, 0, 3}};
// Parse commandline...
int opt;
while ((opt = getopt_long(argc, argv, "vu:t:H:C:", long_options, NULL)) !=
-1) {
switch (opt) {
case 0:
keepalive_options.keepalive_time_ms = std::stoi(optarg);
break;
case 1:
keepalive_options.keepalive_timeout_ms = std::stoi(optarg);
break;
case 2:
keepalive_options.keepalive_permit_without_calls = true;
break;
case 3:
keepalive_options.http2_max_pings_without_data = std::stoi(optarg);
break;
case 'v':
verbose = true;
break;
case 'u':
url = optarg;
break;
case 't':
client_timeout = std::stoi(optarg);
break;
case 'H': {
std::string arg = optarg;
std::string header = arg.substr(0, arg.find(":"));
http_headers[header] = arg.substr(header.size() + 1);
break;
}
case '?':
Usage(argv);
break;
}
}
// We use a simple model that takes 2 input tensors of 16 integers
// each and returns 2 output tensors of 16 integers each. One output
// tensor is the element-wise sum of the inputs and one output is
// the element-wise difference.
std::string model_name = "simple";
std::string model_version = "";
  // Create an InferenceServerGrpcClient instance to communicate with the
// server using gRPC protocol.
std::unique_ptr<tc::InferenceServerGrpcClient> client;
FAIL_IF_ERR(
tc::InferenceServerGrpcClient::Create(
&client, url, verbose, use_ssl, ssl_options, keepalive_options),
"unable to create grpc client");
// Create the data for the two input tensors. Initialize the first
// to unique integers and the second to all ones.
std::vector<int32_t> input0_data(16);
std::vector<int32_t> input1_data(16);
for (size_t i = 0; i < 16; ++i) {
input0_data[i] = i;
input1_data[i] = 1;
}
std::vector<int64_t> shape{1, 16};
// Initialize the inputs with the data.
tc::InferInput* input0;
tc::InferInput* input1;
FAIL_IF_ERR(
tc::InferInput::Create(&input0, "INPUT0", shape, "INT32"),
"unable to get INPUT0");
std::shared_ptr<tc::InferInput> input0_ptr;
input0_ptr.reset(input0);
FAIL_IF_ERR(
tc::InferInput::Create(&input1, "INPUT1", shape, "INT32"),
"unable to get INPUT1");
std::shared_ptr<tc::InferInput> input1_ptr;
input1_ptr.reset(input1);
FAIL_IF_ERR(
input0_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input0_data[0]),
input0_data.size() * sizeof(int32_t)),
"unable to set data for INPUT0");
FAIL_IF_ERR(
input1_ptr->AppendRaw(
reinterpret_cast<uint8_t*>(&input1_data[0]),
input1_data.size() * sizeof(int32_t)),
"unable to set data for INPUT1");
// Generate the outputs to be requested.
tc::InferRequestedOutput* output0;
tc::InferRequestedOutput* output1;
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output0, "OUTPUT0"),
"unable to get 'OUTPUT0'");
std::shared_ptr<tc::InferRequestedOutput> output0_ptr;
output0_ptr.reset(output0);
FAIL_IF_ERR(
tc::InferRequestedOutput::Create(&output1, "OUTPUT1"),
"unable to get 'OUTPUT1'");
std::shared_ptr<tc::InferRequestedOutput> output1_ptr;
output1_ptr.reset(output1);
  // The inference settings. Only the model version and client timeout are
  // set explicitly; everything else uses the defaults.
tc::InferOptions options(model_name);
options.model_version_ = model_version;
options.client_timeout_ = client_timeout;
std::vector<tc::InferInput*> inputs = {input0_ptr.get(), input1_ptr.get()};
std::vector<const tc::InferRequestedOutput*> outputs = {
output0_ptr.get(), output1_ptr.get()};
tc::InferResult* results;
FAIL_IF_ERR(
client->Infer(&results, options, inputs, outputs, http_headers),
"unable to run model");
std::shared_ptr<tc::InferResult> results_ptr;
results_ptr.reset(results);
// Validate the results...
ValidateShapeAndDatatype("OUTPUT0", results_ptr);
ValidateShapeAndDatatype("OUTPUT1", results_ptr);
// Get pointers to the result returned...
int32_t* output0_data;
size_t output0_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT0", (const uint8_t**)&output0_data, &output0_byte_size),
"unable to get result data for 'OUTPUT0'");
if (output0_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT0': "
<< output0_byte_size << std::endl;
exit(1);
}
int32_t* output1_data;
size_t output1_byte_size;
FAIL_IF_ERR(
results_ptr->RawData(
"OUTPUT1", (const uint8_t**)&output1_data, &output1_byte_size),
"unable to get result data for 'OUTPUT1'");
if (output1_byte_size != 64) {
std::cerr << "error: received incorrect byte size for 'OUTPUT1': "
<< output1_byte_size << std::endl;
exit(1);
}
for (size_t i = 0; i < 16; ++i) {
std::cout << input0_data[i] << " + " << input1_data[i] << " = "
<< *(output0_data + i) << std::endl;
std::cout << input0_data[i] << " - " << input1_data[i] << " = "
<< *(output1_data + i) << std::endl;
if ((input0_data[i] + input1_data[i]) != *(output0_data + i)) {
std::cerr << "error: incorrect sum" << std::endl;
exit(1);
}
if ((input0_data[i] - input1_data[i]) != *(output1_data + i)) {
std::cerr << "error: incorrect difference" << std::endl;
exit(1);
}
}
// Get full response
std::cout << results_ptr->DebugString() << std::endl;
std::cout << "PASS : KeepAlive" << std::endl;
return 0;
}