Commit e129194a authored by Sugon_ldc's avatar Sugon_ldc

add new model resnet50v1.5

#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
To infer the model deployed on Triton, you can use the `run_inference_on_triton.py` script.
It sends requests with data obtained from the provided data loader and dumps the received data into npz files.
Those files are stored in the directory specified by the `--output-dir` argument.
Currently, the client communicates with the Triton server asynchronously using the gRPC protocol.
Example call:
```shell script
python ./triton/run_inference_on_triton.py \
--server-url localhost:8001 \
--model-name ResNet50 \
--model-version 1 \
--dump-labels \
--output-dir /results/dump_triton
```
"""
import argparse
import functools
import logging
import queue
import threading
import time
from pathlib import Path
from typing import Optional
from tqdm import tqdm
# pytype: disable=import-error
try:
from tritonclient import utils as client_utils # noqa: F401
from tritonclient.grpc import (
InferenceServerClient,
InferInput,
InferRequestedOutput,
)
except ImportError:
import tritongrpcclient as grpc_client
from tritongrpcclient import (
InferenceServerClient,
InferInput,
InferRequestedOutput,
)
# pytype: enable=import-error
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.args import ArgParserGenerator
from .deployment_toolkit.core import DATALOADER_FN_NAME, load_from_file
from .deployment_toolkit.dump import NpzWriter
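# The module passed via `--dataloader` must expose a dataloader factory, which the deployment
# toolkit loads by the name stored in DATALOADER_FN_NAME (typically `get_dataloader_fn`).
# The sketch below is illustrative only: the tensor names ("INPUT__0", "CLASSES"), shapes and
# dtypes are assumptions and must match the metadata of the deployed model.
def _example_get_dataloader_fn(data_dir: str = "/data", batch_size: int = 32):
    import numpy as np  # imported lazily so this sketch does not affect module imports

    def _dataloader():
        # yield (ids, inputs, labels): ids identify samples, inputs/labels map tensor names
        # to numpy arrays - exactly what AsyncGRPCTritonRunner iterates over
        for i in range(10):
            ids = np.arange(i * batch_size, (i + 1) * batch_size)
            x = {"INPUT__0": np.random.rand(batch_size, 3, 224, 224).astype(np.float32)}
            y_real = {"CLASSES": np.random.randint(0, 1000, size=(batch_size, 1))}
            yield ids, x, y_real

    return _dataloader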
LOGGER = logging.getLogger("run_inference_on_triton")
class AsyncGRPCTritonRunner:
DEFAULT_MAX_RESP_WAIT_S = 120
DEFAULT_MAX_UNRESP_REQS = 128
DEFAULT_MAX_FINISH_WAIT_S = 900 # 15min
def __init__(
self,
server_url: str,
model_name: str,
model_version: str,
*,
dataloader,
verbose=False,
resp_wait_s: Optional[float] = None,
max_unresponded_reqs: Optional[int] = None,
):
self._server_url = server_url
self._model_name = model_name
self._model_version = model_version
self._dataloader = dataloader
self._verbose = verbose
self._response_wait_t = self.DEFAULT_MAX_RESP_WAIT_S if resp_wait_s is None else resp_wait_s
self._max_unresp_reqs = self.DEFAULT_MAX_UNRESP_REQS if max_unresponded_reqs is None else max_unresponded_reqs
self._results = queue.Queue()
self._processed_all = False
self._errors = []
self._num_waiting_for = 0
self._sync = threading.Condition()
self._req_thread = threading.Thread(target=self.req_loop, daemon=True)
def __iter__(self):
self._req_thread.start()
timeout_s = 0.050 # check flags processed_all and error flags every 50ms
while True:
try:
ids, x, y_pred, y_real = self._results.get(timeout=timeout_s)
yield ids, x, y_pred, y_real
except queue.Empty:
shall_stop = self._processed_all or self._errors
if shall_stop:
break
LOGGER.debug("Waiting for request thread to stop")
self._req_thread.join()
if self._errors:
error_msg = "\n".join(map(str, self._errors))
raise RuntimeError(error_msg)
def _on_result(self, ids, x, y_real, output_names, result, error):
with self._sync:
if error:
self._errors.append(error)
else:
y_pred = {name: result.as_numpy(name) for name in output_names}
self._results.put((ids, x, y_pred, y_real))
self._num_waiting_for -= 1
self._sync.notify_all()
def req_loop(self):
client = InferenceServerClient(self._server_url, verbose=self._verbose)
self._errors = self._verify_triton_state(client)
if self._errors:
return
LOGGER.debug(
f"Triton server {self._server_url} and model {self._model_name}:{self._model_version} " f"are up and ready!"
)
model_config = client.get_model_config(self._model_name, self._model_version)
model_metadata = client.get_model_metadata(self._model_name, self._model_version)
LOGGER.info(f"Model config {model_config}")
LOGGER.info(f"Model metadata {model_metadata}")
inputs = {tm.name: tm for tm in model_metadata.inputs}
outputs = {tm.name: tm for tm in model_metadata.outputs}
output_names = list(outputs)
outputs_req = [InferRequestedOutput(name) for name in outputs]
self._num_waiting_for = 0
for ids, x, y_real in self._dataloader:
infer_inputs = []
for name in inputs:
data = x[name]
infer_input = InferInput(name, data.shape, inputs[name].datatype)
target_np_dtype = client_utils.triton_to_np_dtype(inputs[name].datatype)
data = data.astype(target_np_dtype)
infer_input.set_data_from_numpy(data)
infer_inputs.append(infer_input)
with self._sync:
def _check_can_send():
return self._num_waiting_for < self._max_unresp_reqs
can_send = self._sync.wait_for(_check_can_send, timeout=self._response_wait_t)
if not can_send:
error_msg = f"Runner could not send new requests for {self._response_wait_t}s"
self._errors.append(error_msg)
break
callback = functools.partial(AsyncGRPCTritonRunner._on_result, self, ids, x, y_real, output_names)
client.async_infer(
model_name=self._model_name,
model_version=self._model_version,
inputs=infer_inputs,
outputs=outputs_req,
callback=callback,
)
self._num_waiting_for += 1
# wait until all requested data has been received
with self._sync:
def _all_processed():
LOGGER.debug(f"wait for {self._num_waiting_for} unprocessed jobs")
return self._num_waiting_for == 0
self._processed_all = self._sync.wait_for(_all_processed, self.DEFAULT_MAX_FINISH_WAIT_S)
if not self._processed_all:
error_msg = f"Runner {self._response_wait_t}s timeout received while waiting for results from server"
self._errors.append(error_msg)
LOGGER.debug("Finished request thread")
def _verify_triton_state(self, triton_client):
errors = []
if not triton_client.is_server_live():
errors.append(f"Triton server {self._server_url} is not live")
elif not triton_client.is_server_ready():
errors.append(f"Triton server {self._server_url} is not ready")
elif not triton_client.is_model_ready(self._model_name, self._model_version):
errors.append(f"Model {self._model_name}:{self._model_version} is not ready")
return errors
def _parse_args():
parser = argparse.ArgumentParser(description="Infer model on Triton server", allow_abbrev=False)
parser.add_argument(
"--server-url", type=str, default="localhost:8001", help="Inference server URL (default localhost:8001)"
)
parser.add_argument("--model-name", help="The name of the model used for inference.", required=True)
parser.add_argument("--model-version", help="The version of the model used for inference.", required=True)
parser.add_argument("--dataloader", help="Path to python file containing dataloader.", required=True)
parser.add_argument("--dump-labels", help="Dump labels to output dir", action="store_true", default=False)
parser.add_argument("--dump-inputs", help="Dump inputs to output dir", action="store_true", default=False)
parser.add_argument("-v", "--verbose", help="Verbose logs", action="store_true", default=False)
parser.add_argument("--output-dir", required=True, help="Path to directory where outputs will be saved")
parser.add_argument("--response-wait-time", required=False, help="Maximal time to wait for response", default=120)
parser.add_argument(
"--max-unresponded-requests", required=False, help="Maximal number of unresponded requests", default=128
)
args, *_ = parser.parse_known_args()
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
ArgParserGenerator(get_dataloader_fn).update_argparser(parser)
args = parser.parse_args()
return args
def main():
args = _parse_args()
log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
log_level = logging.INFO if not args.verbose else logging.DEBUG
logging.basicConfig(level=log_level, format=log_format)
LOGGER.info(f"args:")
for key, value in vars(args).items():
LOGGER.info(f" {key} = {value}")
get_dataloader_fn = load_from_file(args.dataloader, label="dataloader", target=DATALOADER_FN_NAME)
dataloader_fn = ArgParserGenerator(get_dataloader_fn).from_args(args)
runner = AsyncGRPCTritonRunner(
args.server_url,
args.model_name,
args.model_version,
dataloader=dataloader_fn(),
verbose=False,
resp_wait_s=args.response_wait_time,
max_unresponded_reqs=args.max_unresponded_requests,
)
with NpzWriter(output_dir=args.output_dir) as writer:
start = time.time()
for ids, x, y_pred, y_real in tqdm(runner, unit="batch", mininterval=10):
data = _verify_and_format_dump(args, ids, x, y_pred, y_real)
writer.write(**data)
stop = time.time()
LOGGER.info(f"\nThe inference took {stop - start:0.3f}s")
def _verify_and_format_dump(args, ids, x, y_pred, y_real):
data = {"outputs": y_pred, "ids": {"ids": ids}}
if args.dump_inputs:
data["inputs"] = x
if args.dump_labels:
if not y_real:
raise ValueError(
"Found empty label values. Please provide labels in dataloader_fn or do not use --dump-labels argument"
)
data["labels"] = y_real
return data
if __name__ == "__main__":
main()
#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
For models with variable-sized inputs, you must provide the `--input-shape` argument so that perf_analyzer knows
what shape tensors to use. For example, for a model with an input called IMAGE of shape [ 3, N, M ],
where N and M are variable-size dimensions, tell perf_analyzer to send batch-size 4 requests of shape [ 3, 224, 224 ]
with `--shape IMAGE:3,224,224`.
"""
import argparse
import csv
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.report import save_results, show_results, sort_results
from .deployment_toolkit.warmup import warmup
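# perf_client reports request latency broken into per-stage columns; the helper below sums
# those component columns (missing ones default to 0) to obtain the average end-to-end latency.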
def calculate_average_latency(r):
avg_sum_fields = [
"Client Send",
"Network+Server Send/Recv",
"Server Queue",
"Server Compute",
"Server Compute Input",
"Server Compute Infer",
"Server Compute Output",
"Client Recv",
]
avg_latency = sum([int(r.get(f, 0)) for f in avg_sum_fields])
return avg_latency
def update_performance_data(results: List, batch_size: int, performance_partial_file: str):
row: Dict = {"batch_size": batch_size}
with open(performance_partial_file, "r") as csvfile:
reader = csv.DictReader(csvfile)
for r in reader:
avg_latency = calculate_average_latency(r)
row = {**row, **r, "avg latency": avg_latency}
results.append(row)
def _parse_batch_sizes(batch_sizes: str):
batches = batch_sizes.split(sep=",")
return list(map(lambda x: int(x.strip()), batches))
def offline_performance(
model_name: str,
batch_sizes: List[int],
result_path: str,
input_shapes: Optional[List[str]] = None,
profiling_data: str = "random",
triton_instances: int = 1,
server_url: str = "localhost",
measurement_window: int = 10000,
shared_memory: bool = False
):
print("\n")
print(f"==== Static batching analysis start ====")
print("\n")
input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""
results: List[Dict] = list()
for batch_size in batch_sizes:
print(f"Running performance tests for batch size: {batch_size}")
performance_partial_file = f"triton_performance_partial_{batch_size}.csv"
exec_args = f"""--max-threads {triton_instances} \
-m {model_name} \
-x 1 \
-c {triton_instances} \
-t {triton_instances} \
-p {measurement_window} \
-v \
-i http \
-u {server_url}:8000 \
-b {batch_size} \
-f {performance_partial_file} \
--input-data {profiling_data} {input_shapes}"""
if shared_memory:
exec_args += " --shared-memory=cuda"
result = os.system(f"perf_client {exec_args}")
if result != 0:
print(f"Failed running performance tests. Perf client failed with exit code {result}")
sys.exit(1)
update_performance_data(results, batch_size, performance_partial_file)
os.remove(performance_partial_file)
results = sort_results(results=results)
save_results(filename=result_path, data=results)
show_results(results=results)
print("Performance results for static batching stored in: {0}".format(result_path))
print("\n")
print(f"==== Analysis done ====")
print("\n")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, required=True, help="Name of the model to test")
parser.add_argument(
"--input-data", type=str, required=False, default="random", help="Input data to perform profiling."
)
parser.add_argument(
"--input-shape",
action="append",
required=False,
help="Input data shape in form INPUT_NAME:<full_shape_without_batch_axis>.",
)
parser.add_argument("--batch-sizes", type=str, required=True, help="List of batch sizes to tests. Comma separated.")
parser.add_argument("--result-path", type=str, required=True, help="Path where result file is going to be stored.")
parser.add_argument("--triton-instances", type=int, default=1, help="Number of Triton Server instances")
parser.add_argument("--server-url", type=str, required=False, default="localhost", help="Url to Triton server")
parser.add_argument(
"--measurement-window", required=False, help="Time which perf_analyzer will wait for results", default=10000
)
parser.add_argument("--shared-memory", help="Use shared memory for communication with Triton", action="store_true",
default=False)
args = parser.parse_args()
warmup(
server_url=args.server_url,
model_name=args.model_name,
batch_sizes=_parse_batch_sizes(args.batch_sizes),
triton_instances=args.triton_instances,
profiling_data=args.input_data,
input_shapes=args.input_shape,
measurement_window=args.measurement_window,
shared_memory=args.shared_memory
)
offline_performance(
server_url=args.server_url,
model_name=args.model_name,
batch_sizes=_parse_batch_sizes(args.batch_sizes),
triton_instances=args.triton_instances,
profiling_data=args.input_data,
input_shapes=args.input_shape,
result_path=args.result_path,
measurement_window=args.measurement_window,
shared_memory=args.shared_memory
)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""
For models with variable-sized inputs, you must provide the `--input-shape` argument so that perf_analyzer knows
what shape tensors to use. For example, for a model with an input called IMAGE of shape [ 3, N, M ],
where N and M are variable-size dimensions, tell perf_analyzer to send batch-size 4 requests of shape [ 3, 224, 224 ]
with `--shape IMAGE:3,224,224`.
"""
import argparse
import csv
import os
import sys
from pathlib import Path
from typing import List, Optional
# method from PEP-366 to support relative import in executed modules
if __package__ is None:
__package__ = Path(__file__).parent.name
from .deployment_toolkit.report import save_results, show_results, sort_results
from .deployment_toolkit.warmup import warmup
def calculate_average_latency(r):
avg_sum_fields = [
"Client Send",
"Network+Server Send/Recv",
"Server Queue",
"Server Compute",
"Server Compute Input",
"Server Compute Infer",
"Server Compute Output",
"Client Recv",
]
avg_latency = sum([int(r.get(f, 0)) for f in avg_sum_fields])
return avg_latency
def update_performance_data(results: List, performance_file: str):
with open(performance_file, "r") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
row["avg latency"] = calculate_average_latency(row)
results.append(row)
def _parse_batch_sizes(batch_sizes: str):
batches = batch_sizes.split(sep=",")
return list(map(lambda x: int(x.strip()), batches))
def online_performance(
model_name: str,
batch_sizes: List[int],
result_path: str,
input_shapes: Optional[List[str]] = None,
profiling_data: str = "random",
triton_instances: int = 1,
triton_gpu_engine_count: int = 1,
server_url: str = "localhost",
measurement_window: int = 10000,
shared_memory: bool = False
):
print("\n")
print(f"==== Dynamic batching analysis start ====")
print("\n")
input_shapes = " ".join(map(lambda shape: f" --shape {shape}", input_shapes)) if input_shapes else ""
print(f"Running performance tests for dynamic batching")
performance_file = f"triton_performance_dynamic_partial.csv"
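# Derive the concurrency sweep for dynamic batching from the largest requested batch size:
# allow up to 2 * max_batch_size in-flight requests per Triton instance and GPU engine,
# cap the concurrency at 256, and split the sweep into at most 32 steps.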
max_batch_size = max(batch_sizes)
max_total_requests = 2 * max_batch_size * triton_instances * triton_gpu_engine_count
max_concurrency = min(256, max_total_requests)
batch_size = max(1, max_total_requests // 256)
step = max(1, max_concurrency // 32)
min_concurrency = step
exec_args = f"""-m {model_name} \
-x 1 \
-p {measurement_window} \
-v \
-i http \
-u {server_url}:8000 \
-b {batch_size} \
-f {performance_file} \
--concurrency-range {min_concurrency}:{max_concurrency}:{step} \
--input-data {profiling_data} {input_shapes}"""
if shared_memory:
exec_args += " --shared-memory=cuda"
result = os.system(f"perf_client {exec_args}")
if result != 0:
print(f"Failed running performance tests. Perf client failed with exit code {result}")
sys.exit(1)
results = list()
update_performance_data(results=results, performance_file=performance_file)
results = sort_results(results=results)
save_results(filename=result_path, data=results)
show_results(results=results)
os.remove(performance_file)
print("Performance results for dynamic batching stored in: {0}".format(result_path))
print("\n")
print(f"==== Analysis done ====")
print("\n")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, required=True, help="Name of the model to test")
parser.add_argument(
"--input-data", type=str, required=False, default="random", help="Input data to perform profiling."
)
parser.add_argument(
"--input-shape",
action="append",
required=False,
help="Input data shape in form INPUT_NAME:<full_shape_without_batch_axis>.",
)
parser.add_argument("--batch-sizes", type=str, required=True, help="List of batch sizes to tests. Comma separated.")
parser.add_argument("--triton-instances", type=int, default=1, help="Number of Triton Server instances")
parser.add_argument(
"--number-of-model-instances", type=int, default=1, help="Number of models instances on Triton Server"
)
parser.add_argument("--result-path", type=str, required=True, help="Path where result file is going to be stored.")
parser.add_argument("--server-url", type=str, required=False, default="localhost", help="Url to Triton server")
parser.add_argument(
"--measurement-window", required=False, help="Time which perf_analyzer will wait for results", default=10000
)
parser.add_argument("--shared-memory", help="Use shared memory for communication with Triton", action="store_true",
default=False)
args = parser.parse_args()
warmup(
server_url=args.server_url,
model_name=args.model_name,
batch_sizes=_parse_batch_sizes(args.batch_sizes),
triton_instances=args.triton_instances,
triton_gpu_engine_count=args.number_of_model_instances,
profiling_data=args.input_data,
input_shapes=args.input_shape,
measurement_window=args.measurement_window,
shared_memory=args.shared_memory
)
online_performance(
server_url=args.server_url,
model_name=args.model_name,
batch_sizes=_parse_batch_sizes(args.batch_sizes),
triton_instances=args.triton_instances,
triton_gpu_engine_count=args.number_of_model_instances,
profiling_data=args.input_data,
input_shapes=args.input_shape,
result_path=args.result_path,
measurement_window=args.measurement_window,
shared_memory=args.shared_memory
)
if __name__ == "__main__":
main()
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_REPO=${1:-"/repo"}
OUTPUT=${2:-"/logs"}
MODEL_ARCH=${3:-"resnet50"}
MODEL_CHECKPOINT=${4:-"/checkpoint.pth"}
for backend in ts onnx trt; do
if [[ "$backend" = "ts" ]]; then
EXPORT_NAME="ts-script"
else
EXPORT_NAME="${backend}"
fi
for precision in 16 32; do
if [[ $precision -eq 16 ]]; then
CUSTOM_FLAGS="--fp16"
CUSTOM_TRTFLAGS="--trt-fp16 --max_workspace_size 2147483648"
else
CUSTOM_FLAGS=""
CUSTOM_TRTFLAGS=""
fi
echo "Exporting model as ${EXPORT_NAME} with precision ${precision}"
python -m triton.deployer --${EXPORT_NAME} --triton-model-name model_${backend} --triton-max-batch-size 64 \
--triton-engine-count 2 --save-dir ${MODEL_REPO} ${CUSTOM_TRTFLAGS} -- --config ${MODEL_ARCH} ${CUSTOM_FLAGS}
sleep 30
/workspace/bin/perf_client --max-threads 10 -m model_${backend} -x 1 -p 10000 -v -i gRPC -u localhost:8001 -b 1 \
-l 5000 --concurrency-range 1:2 -f ${OUTPUT}/${backend}_dynamic_${precision}.csv
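# The runs below append to the same CSV: tail -n +2 strips the header row that perf_client
# writes on every invocation, so only data rows are added.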
for CONCURRENCY_LEVEL in 4 8 16 32 64 128 256; do
/workspace/bin/perf_client --max-threads 10 -m model_${backend} -x 1 -p 10000 -v -i gRPC -u localhost:8001 -b 1 \
-l 5000 --concurrency-range $CONCURRENCY_LEVEL:$CONCURRENCY_LEVEL -f >(tail -n +2 >> ${OUTPUT}/${backend}_dynamic_${precision}.csv)
done
rm -rf ${MODEL_REPO}/model_${backend}
done
cat ${OUTPUT}/*_dynamic_*.csv
done
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build -t resnet50 . -f triton/resnet50/Dockerfile
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker run -it --rm \
--gpus "device=all" \
--net=host \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-e WORKDIR=$(pwd) \
-e PYTHONPATH=$(pwd) \
-v $(pwd):$(pwd) \
-w $(pwd) \
resnet50:latest bash
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:=all}
docker run --rm -d \
-p 8000:8000 \
-p 8001:8001 \
-p 8002:8002 \
--runtime=nvidia \
-e NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES} \
-e ORT_TENSORRT_FP16_ENABLE=1 \
-v ${MODEL_REPOSITORY_PATH}:${MODEL_REPOSITORY_PATH} \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
nvcr.io/nvidia/tritonserver:21.02-py3 tritonserver \
--model-store=${MODEL_REPOSITORY_PATH} \
--strict-model-config=false \
--exit-on-error=true \
--model-control-mode=explicit
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Download checkpoint
if [ -f "${CHECKPOINT_DIR}/nvidia_resnet50_200821.pth.tar" ]; then
echo "Checkpoint already downloaded."
else
echo "Downloading checkpoint ..."
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/resnet50_pyt_amp/versions/20.06.0/zip -O \
resnet50_pyt_amp_20.06.0.zip || {
echo "ERROR: Failed to download checkpoint from NGC"
exit 1
}
unzip resnet50_pyt_amp_20.06.0.zip -d ${CHECKPOINT_DIR}
rm resnet50_pyt_amp_20.06.0.zip
echo "ok"
fi
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_NAME=${1}
OUTPUT_FILE=${2:-result.csv}
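# Each batch-size run appends to ${OUTPUT_FILE}: tail -n +2 drops the CSV header emitted by
# perf_client and sed prefixes every data row with a BS=<batch size> column.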
for i in 1 2 4 8 16 32 64 128; do
echo "Model $MODEL_NAME evaluation with BS $i"
/workspace/bin/perf_client --max-threads 10 -m $MODEL_NAME -x 1 -p 10000 -v -i gRPC -u localhost:8001 -b $i -l 5000 \
--concurrency-range 1 -f >(tail -n +2 | sed -e 's/^/BS='${i}',/' >> $OUTPUT_FILE)
done
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ -d "${DATASETS_DIR}/imagenet" ]; then
echo "Dataset already downloaded and processed."
else
python triton/process_dataset.py
fi
#!/bin/bash
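# Summarizes a CSV report produced by perf_client (e.g. one written by the benchmark script).
# Assumed column layout: column 3 holds throughput, columns 4-10 hold the latency components
# summed into the average latency, and columns 12-14 hold the p90/p95/p99 latencies; the
# division by 1000 assumes the values are reported in microseconds.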
echo "Processing file $1"
echo "Throughput:"
cat $1 | cut -d ',' -f3
echo ""
echo "Average latency: "
cat $1 | cut -d ',' -f4-10 | sed "s/,/\+/g" | sed "s/.*/scale=2; (\0) \/ 1000/g" | bc
echo ""
echo "p90 latency: "
cat $1 | cut -d ',' -f12 | sed "s/.*/scale=2; \0 \/ 1000/g" | bc
echo ""
echo "p95 latency: "
cat $1 | cut -d ',' -f13 | sed "s/.*/scale=2; \0 \/ 1000/g" | bc
echo ""
echo "p99 latency: "
cat $1 | cut -d ',' -f14 | sed "s/.*/scale=2; \0 \/ 1000/g" | bc
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
WORKDIR="${WORKDIR:=$(pwd)}"
export WORKSPACE_DIR=${WORKDIR}/workspace
export DATASETS_DIR=${WORKSPACE_DIR}/datasets_dir
export CHECKPOINT_DIR=${WORKSPACE_DIR}/checkpoint_dir
export MODEL_REPOSITORY_PATH=${WORKSPACE_DIR}/model_store
export SHARED_DIR=${WORKSPACE_DIR}/shared_dir
echo "Preparing directories"
mkdir -p ${WORKSPACE_DIR}
mkdir -p ${DATASETS_DIR}
mkdir -p ${CHECKPOINT_DIR}
mkdir -p ${MODEL_REPOSITORY_PATH}
mkdir -p ${SHARED_DIR}
echo "Setting up environment"
export MODEL_NAME=resnet50
export TRITON_LOAD_MODEL_METHOD=explicit
export TRITON_INSTANCES=1
#!/usr/bin/env bash
# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export PRECISION="fp16"
export FORMAT="onnx"
export BATCH_SIZE="1,2,4,8,16,32,64,128"
export BACKEND_ACCELERATOR="trt"
export MAX_BATCH_SIZE="128"
export NUMBER_OF_MODEL_INSTANCES="1"
export TRITON_MAX_QUEUE_DELAY="1"
export TRITON_PREFERRED_BATCH_SIZES="64 128"
# Deploying the SE-ResNeXt101-32x4d model using Triton Inference Server
The [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
This folder contains instructions on how to deploy and run inference on
Triton Inference Server as well as gather detailed performance analysis.
## Table Of Contents
* [Model overview](#model-overview)
* [Setup](#setup)
* [Inference container](#inference-container)
* [Deploying the model](#deploying-the-model)
* [Running the Triton Inference Server](#running-the-triton-inference-server)
* [Quick Start Guide](#quick-start-guide)
* [Running the client](#running-the-client)
* [Gathering performance data](#gathering-performance-data)
* [Advanced](#advanced)
* [Automated benchmark script](#automated-benchmark-script)
* [Performance](#performance)
* [Dynamic batching performance](#dynamic-batching-performance)
* [TensorRT backend inference performance (1x V100 16GB)](#tensorrt-backend-inference-performance-1x-v100-16gb)
* [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
The SE-ResNeXt101-32x4d is a [ResNeXt101-32x4d](https://arxiv.org/pdf/1611.05431.pdf)
model with added Squeeze-and-Excitation module introduced
in [Squeeze-and-Excitation Networks](https://arxiv.org/pdf/1709.01507.pdf) paper.
The SE-ResNeXt101-32x4d model can be deployed for inference on the [NVIDIA Triton Inference Server](https://github.com/NVIDIA/triton-inference-server) using
TorchScript, ONNX Runtime or TensorRT as an execution backend.
## Setup
This setup requires a trained SE-ResNeXt101-32x4d model checkpoint that can be used for deployment.
### Inference container
For easy deployment, a build script for a dedicated inference container is provided. To build that container, go to the main repository folder and run:
`docker build -t sernxt_inference . -f triton/Dockerfile`
This command downloads the dependencies and builds the inference container. Then, run a shell inside the container:
`docker run -it --rm --gpus device=0 --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --net=host -v <PATH_TO_MODEL_REPOSITORY>:/repository sernxt_inference bash`
Here `device=0` exposes only the GPU with ordinal `0` to the container; writing, for example, `device=0,1,2,3` exposes the GPUs indexed by ordinals `0,1,2` and `3`, and `device=all` exposes all available GPUs. `PATH_TO_MODEL_REPOSITORY` indicates the location where the
deployed models are stored.
### Deploying the model
To deploy the SE-ResNeXt101-32x4d model on the Triton Inference Server, you must run the `deployer.py` script from inside the deployment Docker container to convert the checkpoint into a compatible format.
```
usage: deployer.py [-h] (--ts-script | --ts-trace | --onnx | --trt)
[--triton-no-cuda] [--triton-model-name TRITON_MODEL_NAME]
[--triton-model-version TRITON_MODEL_VERSION]
[--triton-server-url TRITON_SERVER_URL]
[--triton-max-batch-size TRITON_MAX_BATCH_SIZE]
[--triton-dyn-batching-delay TRITON_DYN_BATCHING_DELAY]
[--triton-engine-count TRITON_ENGINE_COUNT]
[--save-dir SAVE_DIR]
[--max_workspace_size MAX_WORKSPACE_SIZE] [--trt-fp16]
[--capture-cuda-graph CAPTURE_CUDA_GRAPH]
...
optional arguments:
-h, --help show this help message and exit
--ts-script convert to torchscript using torch.jit.script
--ts-trace convert to torchscript using torch.jit.trace
--onnx convert to onnx using torch.onnx.export
--trt convert to trt using tensorrt
triton related flags:
--triton-no-cuda Use the CPU for tracing.
--triton-model-name TRITON_MODEL_NAME
exports to appropriate directory structure for TRITON
--triton-model-version TRITON_MODEL_VERSION
exports to appropriate directory structure for TRITON
--triton-server-url TRITON_SERVER_URL
exports to appropriate directory structure for TRITON
--triton-max-batch-size TRITON_MAX_BATCH_SIZE
Specifies the 'max_batch_size' in the TRITON model
config. See the TRITON documentation for more info.
--triton-dyn-batching-delay TRITON_DYN_BATCHING_DELAY
Determines the dynamic_batching queue delay in
milliseconds(ms) for the TRITON model config. Use '0'
or '-1' to specify static batching. See the TRITON
documentation for more info.
--triton-engine-count TRITON_ENGINE_COUNT
Specifies the 'instance_group' count value in the
TRITON model config. See the TRITON documentation for
more info.
--save-dir SAVE_DIR Saved model directory
optimization flags:
--max_workspace_size MAX_WORKSPACE_SIZE
set the size of the workspace for trt export
--trt-fp16 trt flag ---- export model in mixed precision mode
--capture-cuda-graph CAPTURE_CUDA_GRAPH
capture cuda graph for obtaining speedup. possible
values: 0, 1. default: 1.
model_arguments arguments that will be ignored by deployer lib and
will be forwarded to your deployer script
```
The following model-specific arguments have to be specified for model deployment:
```
--config CONFIG Network architecture to use for deployment (eg. resnet50,
resnext101-32x4d or se-resnext101-32x4d)
--checkpoint CHECKPOINT
Path to stored model weight. If not specified, model will be
randomly initialized
--batch_size BATCH_SIZE
Batch size used for dummy dataloader
--fp16 Use model with half-precision calculations
```
For example, to deploy the model in TensorRT format with half precision and a maximum batch size of 64, under the name
`sernxt-trt-16`, execute:
`python -m triton.deployer --trt --trt-fp16 --triton-model-name sernxt-trt-16 --triton-max-batch-size 64 --save-dir /repository -- --config se-resnext101-32x4d --checkpoint model_checkpoint --batch_size 64 --fp16`
Here, `model_checkpoint` is a checkpoint of a trained model with the same architecture (se-resnext101-32x4d) as the one used during export.
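The other export backends work analogously; for example, a hypothetical ONNX export of the same checkpoint could use `python -m triton.deployer --onnx --triton-model-name sernxt-onnx-16 --triton-max-batch-size 64 --save-dir /repository -- --config se-resnext101-32x4d --checkpoint model_checkpoint --batch_size 64 --fp16`.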
### Running the Triton Inference Server
**NOTE: This step is executed outside the inference container.**
Pull the Triton Inference Server container from our repository:
`docker pull nvcr.io/nvidia/tritonserver:20.07-py3`
Run the command to start the Triton Inference Server:
`docker run -d --rm --gpus device=0 --ipc=host --network=host -p 8000:8000 -p 8001:8001 -p 8002:8002 -v <PATH_TO_MODEL_REPOSITORY>:/models nvcr.io/nvidia/tritonserver:20.07-py3 trtserver --model-store=/models --log-verbose=1 --model-control-mode=poll --repository-poll-secs=5`
Here `device=0` exposes only the GPU with ordinal `0` to the server; writing, for example, `device=0,1,2,3` exposes the GPUs indexed by ordinals `0,1,2` and `3`, and `device=all` exposes all available GPUs. `PATH_TO_MODEL_REPOSITORY` indicates the location where the
deployed models were stored. The additional `--model-control-mode=poll` option allows the server to reload a model when it changes in the filesystem. It is required for the benchmark scripts, which work with multiple model versions on a single Triton Inference Server instance.
## Quick Start Guide
### Running the client
The client `client.py` checks the model accuracy against synthetic or real validation
data. The client connects to Triton Inference Server and performs inference.
```
usage: client.py [-h] --triton-server-url TRITON_SERVER_URL
--triton-model-name TRITON_MODEL_NAME [-v]
[--inference_data INFERENCE_DATA] [--batch_size BATCH_SIZE]
[--fp16]
optional arguments:
-h, --help show this help message and exit
--triton-server-url TRITON_SERVER_URL
URL address of Triton server (with port)
--triton-model-name TRITON_MODEL_NAME
Triton deployed model name
-v, --verbose Verbose mode.
--inference_data INFERENCE_DATA
Path to file with inference data.
--batch_size BATCH_SIZE
Inference request batch size
--fp16 Use fp16 precision for input data
```
To run inference on the model exported in the previous steps, using the inference data located at
`/data/test_data.bin`, run:
`python -m triton.client --triton-server-url localhost:8001 --triton-model-name sernxt-trt-16 --inference_data /data/test_data.bin --batch_size 16 --fp16`
### Gathering performance data
Performance data can be gathered using the `perf_client` tool. To use this tool to measure performance for batch_size=32, the following command can be used:
`/workspace/bin/perf_client --max-threads 10 -m sernxt-trt-16 -x 1 -p 10000 -v -i gRPC -u localhost:8001 -b 32 -l 5000 --concurrency-range 1 -f result.csv`
For more information about `perf_client`, refer to the [documentation](https://docs.nvidia.com/deeplearning/sdk/triton-inference-server-master-branch-guide/docs/optimization.html#perf-client).
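The CSV report written by `perf_client` can also be post-processed directly. The snippet below is a minimal sketch that sums the per-stage latency columns into an average end-to-end latency, mirroring what the automated performance scripts in this folder do; the throughput and concurrency column names are assumptions and may differ between `perf_client` versions.
```
import csv

LATENCY_FIELDS = [
    "Client Send", "Network+Server Send/Recv", "Server Queue", "Server Compute",
    "Server Compute Input", "Server Compute Infer", "Server Compute Output", "Client Recv",
]

with open("result.csv") as csvfile:
    for row in csv.DictReader(csvfile):
        # missing columns default to 0; latencies are reported in microseconds
        avg_latency_us = sum(int(row.get(field, 0)) for field in LATENCY_FIELDS)
        print(f"concurrency={row.get('Concurrency')} "
              f"throughput={row.get('Inferences/Second')} "
              f"avg_latency_ms={avg_latency_us / 1000:.2f}")
```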
## Advanced
### Automated benchmark script
To automate benchmarks of different model configurations, a special benchmark script is located in `triton/scripts/benchmark.sh`. To use this script,
run Triton Inference Server and then execute the script as follows:
`bash triton/scripts/benchmark.sh <MODEL_REPOSITORY> <LOG_DIRECTORY> <ARCHITECTURE> (<CHECKPOINT_PATH>)`
The benchmark script tests all supported backends with different batch sizes and server configurations. Logs from execution will be stored in `<LOG_DIRECTORY>`.
To process the logs from static-configuration runs, the `triton/scripts/process_output.sh` script can be used.
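For example, assuming the log files follow the `<backend>_dynamic_<precision>.csv` naming used by the benchmark script, a single report can be summarized with `bash triton/scripts/process_output.sh <LOG_DIRECTORY>/trt_dynamic_16.csv`.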
## Performance
The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
### Dynamic batching performance
The Triton Inference Server has a built-in dynamic batching mechanism that can be enabled. When it is enabled, the server creates inference batches from multiple received requests, which achieves better performance than running inference on each request individually. In this benchmark, a single request carries a single image, and with dynamic batching enabled the server concatenates such single-image requests into an inference batch. The upper bound of the inference batch size is set to 64. All of these parameters are configurable.
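For illustration, the sketch below issues many concurrent single-image requests over gRPC, which is the traffic pattern dynamic batching groups into server-side batches; the model and tensor names (`sernxt-trt-16`, `INPUT__0`, `OUTPUT__0`) are placeholders and must match the deployed model configuration.
```
import time

import numpy as np
from tritonclient.grpc import InferenceServerClient, InferInput, InferRequestedOutput

client = InferenceServerClient("localhost:8001")
image = np.random.rand(1, 3, 224, 224).astype(np.float32)  # a single-image request
responses = []

for _ in range(64):  # many in-flight single-image requests become candidates for one batch
    infer_input = InferInput("INPUT__0", list(image.shape), "FP32")
    infer_input.set_data_from_numpy(image)
    client.async_infer(
        model_name="sernxt-trt-16",
        inputs=[infer_input],
        outputs=[InferRequestedOutput("OUTPUT__0")],
        callback=lambda result, error: responses.append((result, error)),
    )

while len(responses) < 64:  # wait until every callback has fired
    time.sleep(0.01)
```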
Our results were obtained by running the automated benchmark script.
Throughput is measured in images/second, and latency in milliseconds.
### TensorRT backend inference performance (1x V100 16GB)
**FP32 Inference Performance**
|**Concurrent requests**|**Throughput (img/s)**|**Avg. Latency (ms)**|**90% Latency (ms)**|**95% Latency (ms)**|**99% Latency (ms)**|
|-----|--------|-------|--------|-------|-------|
|1 | 62.1 | 16.10 | 16.20 | 16.23 | 16.33|
|2 | 66.2 | 30.23 | 30.26 | 30.27 | 30.32|
|4 | 124.6 | 32.13 | 32.19 | 32.21 | 32.28|
|8 | 151.1 | 52.91 | 53.10 | 53.15 | 53.21|
|16 | 240 | 66.51 | 66.82 | 66.91 | 67.05|
|32 | 326.8 | 98.00 | 132.41 | 134.00 | 137.71|
|64 | 412.6 | 154.74 | 182.47 | 185.90 | 195.43|
|128 | 506.7 | 252.58 | 275.03 | 277.56 | 279.86|
|256 | 588.8 | 434.40 | 435.82 | 436.59 | 444.09|
**FP16 Inference Performance**
|**Concurrent requests**|**Throughput (img/s)**|**Avg. Latency (ms)**|**90% Latency (ms)**|**95% Latency (ms)**|**99% Latency (ms)**|
|-----|--------|-------|--------|-------|-------|
|1 | 77.5 | 12.90 | 12.98 | 13.01 | 13.05|
|2 | 82.8 | 24.15 | 24.23 | 24.25 | 24.30|
|4 | 128.8 | 31.06 | 38.81 | 39.15 | 39.31|
|8 | 212 | 37.68 | 42.28 | 43.06 | 43.17|
|16 | 351.3 | 45.52 | 48.41 | 48.52 | 48.92|
|32 | 548 | 58.38 | 59.09 | 59.38 | 59.80|
|64 | 774 | 82.63 | 84.40 | 84.88 | 86.49|
|128 | 985.7 | 130.30 | 130.83 | 131.26 | 132.86|
|256 | 1132.8 | 225.56 | 226.34 | 227.31 | 229.30 |
![Latency vs Throughput](./Latency-vs-Throughput-TensorRT.png)
![Performance analysis - TensorRT FP32](./Performance-analysis-TensorRT-FP32.png)
![Performance analysis - TensorRT FP16](./Performance-analysis-TensorRT-FP16.png)
## Release notes
### Changelog
September 2020
- Initial release