"vscode:/vscode.git/clone" did not exist on "22f55e1b15287b5e7e46e49f523c95a4c93c996c"
Commit e5ca7e62 authored by hepj987's avatar hepj987
Browse files

初始化仓库

parents
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import json
import argparse
import numpy as np
from builtins import range
from tensorrtserver.api import *
#
import sys
sys.path.append('../')
from inference import preprocess_tokenized_text, get_answer
from tokenization import BertTokenizer
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', type=int, default=1,
help='batch size for inference. default: 1')
parser.add_argument("--triton-model-name", type=str, default="model_name",
help="the name of the model used for inference")
parser.add_argument("--triton-model-version", type=int, default=-1,
help="the version of the model used for inference")
parser.add_argument("--triton-server-url", type=str, default="localhost:8000",
help="Inference server URL. Default is localhost:8000.")
parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False,
help='Enable verbose output')
parser.add_argument('--protocol', type=str, required=False, default='http',
help='Protocol ("http"/"grpc") used to ' +
'communicate with inference service. Default is "http".')
parser.add_argument('-H', dest='http_headers', metavar="HTTP_HEADER",
required=False, action='append',
help='HTTP headers to add to inference server requests. ' +
'Format is -H"Header:Value".')
## pre- and postprocessing parameters
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. ")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--n_best_size", default=1, type=int,
help="The total number of n-best predictions to generate. ")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, then the model can reply with "unknown". ')
parser.add_argument('--null_score_diff_threshold',
type=float, default=-11.0,
help="If null_score - best_non_null is greater than the threshold predict 'unknown'. ")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
# input texts
parser.add_argument("--question", default="Most antibiotics target bacteria and don't affect what class of organisms? ",
type=str, help="question")
parser.add_argument("--context", default="Within the genitourinary and gastrointestinal tracts, commensal flora serve as biological barriers by competing with pathogenic bacteria for food and space and, in some cases, by changing the conditions in their environment, such as pH or available iron. This reduces the probability that pathogens will reach sufficient numbers to cause illness. However, since most antibiotics non-specifically target bacteria and do not affect fungi, oral antibiotics can lead to an overgrowth of fungi and cause conditions such as a vaginal candidiasis (a yeast infection). There is good evidence that re-introduction of probiotic flora, such as pure cultures of the lactobacilli normally found in unpasteurized yogurt, helps restore a healthy balance of microbial populations in intestinal infections in children and encouraging preliminary data in studies on bacterial gastroenteritis, inflammatory bowel diseases, urinary tract infection and post-surgical infections. ",
type=str, help="context")
args = parser.parse_args()
args.protocol = ProtocolType.from_str(args.protocol)
# Create a health context, get the ready and live state of server.
health_ctx = ServerHealthContext(args.triton_server_url, args.protocol,
http_headers=args.http_headers, verbose=args.verbose)
print("Health for model {}".format(args.triton_model_name))
print("Live: {}".format(health_ctx.is_live()))
print("Ready: {}".format(health_ctx.is_ready()))
# Create a status context and get server status
status_ctx = ServerStatusContext(args.triton_server_url, args.protocol, args.triton_model_name,
http_headers=args.http_headers, verbose=args.verbose)
print("Status for model {}".format(args.triton_model_name))
print(status_ctx.get_server_status())
# Create the inference context for the model.
infer_ctx = InferContext(args.triton_server_url, args.protocol, args.triton_model_name, args.triton_model_version,
http_headers=args.http_headers, verbose=args.verbose)
print("question: ", args.question)
print("context: ", args.context)
print()
# pre-processing
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
doc_tokens = args.context.split()
query_tokens = tokenizer.tokenize(args.question)
feature = preprocess_tokenized_text(doc_tokens,
query_tokens,
tokenizer,
max_seq_length=args.max_seq_length,
max_query_length=args.max_query_length)
tensors_for_inference, tokens_for_postprocessing = feature
dtype = np.int64
input_ids = np.array(tensors_for_inference.input_ids, dtype=dtype)[None,...] # make bs=1
segment_ids = np.array(tensors_for_inference.segment_ids, dtype=dtype)[None,...] # make bs=1
input_mask = np.array(tensors_for_inference.input_mask, dtype=dtype)[None,...] # make bs=1
assert args.batch_size == input_ids.shape[0]
assert args.batch_size == segment_ids.shape[0]
assert args.batch_size == input_mask.shape[0]
# prepare inputs
input_dict = {
"input__0" : tuple(input_ids[i] for i in range(args.batch_size)),
"input__1" : tuple(segment_ids[i] for i in range(args.batch_size)),
"input__2" : tuple(input_mask[i] for i in range(args.batch_size))
}
# prepare outputs
output_keys = [
"output__0",
"output__1"
]
output_dict = {}
for k in output_keys:
output_dict[k] = InferContext.ResultFormat.RAW
# Send inference request to the inference server.
result = infer_ctx.run(input_dict, output_dict, args.batch_size)
# get the result
start_logits = result["output__0"][0].tolist()
end_logits = result["output__1"][0].tolist()
# post-processing
answer, answers = get_answer(doc_tokens, tokens_for_postprocessing,
start_logits, end_logits, args)
# print result
print()
print(answer)
print()
print(json.dumps(answers, indent=4))
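# A minimal sketch of issuing the same request asynchronously with this legacy client API,
# mirroring the async_run()/get_async_run_results() callback pattern used by
# triton/run_squad_client.py elsewhere in this commit (the callback name is illustrative):
#
#   def on_result(ctx, request_id):
#       async_result = infer_ctx.get_async_run_results(request_id)
#       # post-process async_result["output__0"] / async_result["output__1"] exactly as above
#
#   infer_ctx.async_run(on_result, input_dict, output_dict, batch_size=args.batch_size)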
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import pickle
import argparse
import deployer_lib
#
import sys
sys.path.append('../')
sys.path.append('.')
from modeling import BertForQuestionAnswering, BertConfig
from tokenization import BertTokenizer
from run_squad import convert_examples_to_features, read_squad_examples
def get_model_args(model_args):
''' the arguments initialize_model will receive '''
parser = argparse.ArgumentParser()
## Required parameters by the model.
parser.add_argument("--checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint of the model. ")
parser.add_argument('--batch_size',
default=8,
type=int,
help='Batch size for inference')
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument('--fp16',
action='store_true',
help="use mixed-precision")
parser.add_argument('--nbatches',
default=2,
type=int,
help='Number of batches in the inference dataloader. Default: 2. ')
return parser.parse_args(model_args)
def initialize_model(args):
''' return model, ready to trace '''
config = BertConfig.from_json_file(args.config_file)
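# Pad the vocabulary size up to a multiple of 8 (a Tensor Core friendly alignment); the
# checkpoint loaded below is expected to carry the same padded embedding size.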
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
model = BertForQuestionAnswering(config)
model.enable_apex(False)
state_dict = torch.load(args.checkpoint, map_location='cpu')["model"]
model.load_state_dict(state_dict)
if args.fp16:
model.half()
return model
def get_dataloader(args):
''' return dataloader for inference '''
# Preprocess input data
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride)
try:
with open(cached_features_file, "rb") as reader:
eval_features = pickle.load(reader)
except: # no usable cached features; rebuild them from the SQuAD file below
eval_examples = read_squad_examples(
input_file=args.predict_file,
is_training=False,
version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
with open(cached_features_file, "wb") as writer:
pickle.dump(eval_features, writer)
data = []
for feature in eval_features:
input_ids = torch.tensor(feature.input_ids, dtype=torch.int64)
input_mask = torch.tensor(feature.input_mask, dtype=torch.int64)
segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64)
inp = (input_ids, segment_ids, input_mask)
data.append(inp)
if args.nbatches > 0:
data = data[:args.nbatches*args.batch_size]
test_loader = torch.utils.data.DataLoader(
data,
batch_size=args.batch_size,
shuffle=False,
num_workers=1,
pin_memory=True)
return test_loader
if __name__=='__main__':
# don't touch this!
deployer, model_argv = deployer_lib.create_deployer(sys.argv[1:]) # returns the deployer object and the leftover (model-specific) arguments
model_args = get_model_args(model_argv)
model = initialize_model(model_args)
dataloader = get_dataloader(model_args)
deployer.deploy(dataloader, model)
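# Example invocation (see triton/export_model.sh in this commit for the full command line):
#
#   python triton/deployer.py --ts-script \
#       --save-dir /results/triton_models \
#       --triton-model-name bertQA-ts-script --triton-model-version 1 \
#       --triton-max-batch-size 8 --triton-dyn-batching-delay 0 --triton-engine-count 1 \
#       -- --checkpoint /workspace/bert/checkpoints/bert_qa.pt \
#          --config_file /workspace/bert/bert_config.json \
#          --vocab_file /workspace/bert/vocab/vocab \
#          --predict_file /workspace/bert/data/squad/v1.1/dev-v1.1.json \
#          --do_lower_case --batch_size=8 --fp16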
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import json
import torch
import argparse
import statistics
from collections import Counter
torch_type_to_triton_type = {
torch.bool: 'TYPE_BOOL',
torch.int8: 'TYPE_INT8',
torch.int16: 'TYPE_INT16',
torch.int32: 'TYPE_INT32',
torch.int64: 'TYPE_INT64',
torch.uint8: 'TYPE_UINT8',
torch.float16: 'TYPE_FP16',
torch.float32: 'TYPE_FP32',
torch.float64: 'TYPE_FP64'
}
CONFIG_TEMPLATE = r"""
name: "{model_name}"
platform: "{platform}"
max_batch_size: {max_batch_size}
input [
{spec_inputs}
]
output [
{spec_outputs}
]
{dynamic_batching}
{model_optimizations}
instance_group [
{{
count: {engine_count}
kind: KIND_GPU
gpus: [ {gpu_list} ]
}}
]"""
INPUT_TEMPLATE = r"""
{{
name: "input__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""
OUTPUT_TEMPLATE = r"""
{{
name: "output__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""
MODEL_OPTIMIZATION_TEMPLATE = r"""
optimization {{
cuda {{
graphs: {capture_cuda_graph}
}}
}}"""
def remove_empty_lines(text):
''' removes empty lines from text, returns the result '''
ret = "".join([s for s in text.strip().splitlines(True) if s.strip()])
return ret
def create_deployer(argv):
''' takes a list of arguments, returns a deployer object and the list of unused arguments '''
parser = argparse.ArgumentParser()
# required args
method = parser.add_mutually_exclusive_group(required=True)
method.add_argument('--ts-script',
action='store_true',
help='convert to torchscript using torch.jit.script')
method.add_argument('--ts-trace',
action='store_true',
help='convert to torchscript using torch.jit.trace')
method.add_argument('--onnx',
action='store_true',
help='convert to onnx using torch.onnx.export')
# triton related args
arguments = parser.add_argument_group('triton related flags')
arguments.add_argument('--triton-no-cuda',
action='store_true',
help='Use the CPU for tracing.')
arguments.add_argument('--triton-model-name',
type=str,
default="model",
help="exports to appropriate directory structure for Triton")
arguments.add_argument("--triton-model-version",
type=int,
default=1,
help="exports to appropriate directory structure for Triton")
arguments.add_argument("--triton-server-url",
type=str,
default="localhost:8001",
help="exports to appropriate directory structure for Triton")
arguments.add_argument("--triton-max-batch-size",
type=int,
default=8,
help="Specifies the 'max_batch_size' in the Triton model config.\
See the Triton documentation for more info.")
arguments.add_argument("--triton-dyn-batching-delay",
type=float,
default=0,
help="Determines the dynamic_batching queue delay in milliseconds(ms) for\
the Triton model config. Use '0' or '-1' to specify static batching.\
See the Triton documentation for more info.")
arguments.add_argument("--triton-engine-count",
type=int,
default=1,
help="Specifies the 'instance_group' count value in the Triton model config.\
See the Triton documentation for more info.")
arguments.add_argument('--save-dir', type=str, default='./triton_models', help='Saved model directory')
# optimization args
arguments = parser.add_argument_group('optimization flags')
arguments.add_argument("--capture-cuda-graph",
type=int,
default=0,
help="capture cuda graph for obtaining speedup. possible values: 0, 1. default: 0 (automatic). ")
# remainder args
arguments.add_argument('model_arguments', nargs=argparse.REMAINDER, help='arguments that will be ignored by deployer lib and will be forwarded to your deployer script')
#
args = parser.parse_args(argv)
deployer = Deployer(args)
#
return deployer, args.model_arguments[1:]
class DeployerLibrary:
def __init__(self, args):
self.args = args
self.platform = None
def set_platform(self, platform):
''' sets the platform
:: platform :: "pytorch_libtorch" or "onnxruntime_onnx" or "tensorrt_plan"
'''
self.platform = platform
def prepare_inputs(self, dataloader, device):
''' load sample inputs to device '''
inputs = []
for batch in dataloader:
if type(batch) is torch.Tensor:
batch_d = batch.to(device)
batch_d = (batch_d,)
inputs.append(batch_d)
else:
batch_d = []
for x in batch:
assert type(x) is torch.Tensor, "input is not a tensor"
batch_d.append(x.to(device))
batch_d = tuple(batch_d)
inputs.append(batch_d)
return inputs
def get_list_of_shapes(self, l, fun):
''' returns the list of min/max shapes, depending on fun
:: l :: list of tuples of tensors
:: fun :: min or max
'''
tensor_tuple = l[0]
shapes = [list(x.shape) for x in tensor_tuple]
for tensor_tuple in l:
assert len(tensor_tuple) == len(shapes), "tensors with varying shape lengths are not supported"
for i,x in enumerate(tensor_tuple):
for j in range(len(x.shape)):
shapes[i][j] = fun(shapes[i][j], x.shape[j])
return shapes # a list of shapes
def get_tuple_of_min_shapes(self, l):
''' returns the tuple of min shapes
:: l :: list of tuples of tensors '''
shapes = self.get_list_of_shapes(l, min)
min_batch = 1
shapes = [[min_batch,*shape[1:]] for shape in shapes]
shapes = tuple(shapes)
return shapes # tuple of min shapes
def get_tuple_of_max_shapes(self, l):
''' returns the tuple of max shapes
:: l :: list of tuples of tensors '''
shapes = self.get_list_of_shapes(l, max)
max_batch = max(2,shapes[0][0])
shapes = [[max_batch,*shape[1:]] for shape in shapes]
shapes = tuple(shapes)
return shapes # tuple of max shapes
def get_tuple_of_opt_shapes(self, l):
''' returns the tuple of opt shapes
:: l :: list of tuples of tensors '''
counter = Counter()
for tensor_tuple in l:
shapes = [tuple(x.shape) for x in tensor_tuple]
shapes = tuple(shapes)
counter[shapes] += 1
shapes = counter.most_common(1)[0][0]
return shapes # tuple of the most commonly occurring shapes
def get_tuple_of_dynamic_shapes(self, l):
''' returns a tuple of dynamic shapes: variable tensor dimensions
(for ex. batch size) occur as -1 in the tuple
:: l :: list of tuples of tensors '''
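# e.g. for a list of (input_ids, segment_ids, input_mask) tuples all padded to [batch, 384],
# only the batch dimension varies, so this returns ([-1, 384], [-1, 384], [-1, 384])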
tensor_tuple = l[0]
shapes = [list(x.shape) for x in tensor_tuple]
for tensor_tuple in l:
err_msg = "tensors with varying shape lengths are not supported"
assert len(tensor_tuple) == len(shapes), err_msg
for i,x in enumerate(tensor_tuple):
for j in range(len(x.shape)):
if shapes[i][j] != x.shape[j] or j == 0:
shapes[i][j] = -1
shapes = tuple(shapes)
return shapes # tuple of dynamic shapes
def run_models(self, models, inputs):
''' run the models on inputs, return the outputs and execution times '''
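# for models = (A, B) the flat return value is [outputs_A, time_A, outputs_B, time_B];
# callers below unpack it as outputs, time_model, outputs_X, time_model_X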
ret = []
for model in models:
torch.cuda.synchronize()
time_start = time.time()
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
torch.cuda.synchronize()
time_end = time.time()
t = time_end - time_start
ret.append(outputs)
ret.append(t)
return ret
def compute_errors(self, outputs_A, outputs_B):
''' returns the list of L_inf errors computed over every single output tensor '''
Linf_errors = []
for output_A,output_B in zip(outputs_A,outputs_B):
for x,y in zip(output_A, output_B):
error = (x - y).norm(float('inf')).item()
Linf_errors.append(error)
return Linf_errors
def print_errors(self, Linf_errors):
''' print various statistics of the Linf errors '''
print()
print("conversion correctness test results")
print("-----------------------------------")
print("maximal absolute error over dataset (L_inf): ", max(Linf_errors))
print()
print("average L_inf error over output tensors: ", statistics.mean(Linf_errors))
print("variance of L_inf error over output tensors: ", statistics.variance(Linf_errors))
print("stddev of L_inf error over output tensors: ", statistics.stdev(Linf_errors))
print()
def write_config(self, config_filename,
input_shapes, input_types,
output_shapes, output_types):
''' writes Triton config file
:: config_filename :: the file to write the config file into
:: input_shapes :: tuple of dynamic shapes of the input tensors
:: input_types :: tuple of torch types of the input tensors
:: output_shapes :: tuple of dynamic shapes of the output tensors
:: output_types :: tuple of torch types of the output tensors
'''
assert self.platform is not None, "error - platform is not set"
config_template = CONFIG_TEMPLATE
input_template = INPUT_TEMPLATE
optimization_template = MODEL_OPTIMIZATION_TEMPLATE
spec_inputs = r""""""
for i,(shape,typ) in enumerate(zip(input_shapes,input_types)):
d = {
'num' : str(i),
'type': torch_type_to_triton_type[typ],
'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
}
d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
spec_inputs += input_template.format_map(d)
spec_inputs = spec_inputs[:-1]
output_template = OUTPUT_TEMPLATE
spec_outputs = r""""""
for i,(shape,typ) in enumerate(zip(output_shapes,output_types)):
d = {
'num' : str(i),
'type': torch_type_to_triton_type[typ],
'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
}
d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
spec_outputs += output_template.format_map(d)
spec_outputs = spec_outputs[:-1]
batching_str = ""
max_batch_size = self.args.triton_max_batch_size
if (self.args.triton_dyn_batching_delay > 0):
# Use only full and half full batches
pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]
batching_str = r"""
dynamic_batching {{
preferred_batch_size: [{0}]
max_queue_delay_microseconds: {1}
}}""".format(", ".join([str(x) for x in pref_batch_size]),
int(self.args.triton_dyn_batching_delay * 1000.0))
d = {
"capture_cuda_graph": str(self.args.capture_cuda_graph)
}
optimization_str = optimization_template.format_map(d)
config_values = {
"model_name": self.args.triton_model_name,
"platform": self.platform,
"max_batch_size": max_batch_size,
"spec_inputs": spec_inputs,
"spec_outputs": spec_outputs,
"dynamic_batching": batching_str,
"model_optimizations" : optimization_str,
"gpu_list": ", ".join([str(x) for x in range(torch.cuda.device_count())]),
"engine_count": self.args.triton_engine_count
}
# write config
with open(config_filename, "w") as file:
final_config_str = config_template.format_map(config_values)
final_config_str = remove_empty_lines(final_config_str)
file.write(final_config_str)
class Deployer:
def __init__(self, args):
self.args = args
self.lib = DeployerLibrary(args)
def deploy(self, dataloader, model):
''' deploy the model and test for correctness with dataloader '''
if self.args.ts_script or self.args.ts_trace:
self.lib.set_platform("pytorch_libtorch")
print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
self.to_triton_torchscript(dataloader, model)
elif self.args.onnx:
self.lib.set_platform("onnxruntime_onnx")
print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
self.to_triton_onnx(dataloader, model)
else:
assert False, "error"
print("done")
def to_triton_onnx(self, dataloader, model):
''' export the model to onnx and test correctness on dataloader '''
import onnx
import onnxruntime
# setup device
if self.args.triton_no_cuda:
device = torch.device('cpu')
else:
device = torch.device('cuda')
# prepare model
model.to(device)
model.eval()
assert not model.training, "internal error - model should be in eval() mode! "
# prepare inputs
inputs = self.lib.prepare_inputs(dataloader, device)
# generate outputs
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
# generate input shapes - dynamic tensor shape support
input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
# generate output shapes - dynamic tensor shape support
output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
# generate input types
input_types = [x.dtype for x in inputs[0]]
# generate output types
output_types = [x.dtype for x in outputs[0]]
# get input names
rng = range(len(input_types))
input_names = ["input__" + str(num) for num in rng]
# get output names
rng = range(len(output_types))
output_names = ["output__" + str(num) for num in rng]
# prepare save path
model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
if not os.path.exists(version_folder):
os.makedirs(version_folder)
final_model_path = os.path.join(version_folder, 'model.onnx')
# get indices of dynamic input and output shapes
dynamic_axes = {}
for input_name,input_shape in zip(input_names,input_shapes):
dynamic_axes[input_name] = [i for i,x in enumerate(input_shape) if x == -1]
for output_name,output_shape in zip(output_names,output_shapes):
dynamic_axes[output_name] = [i for i,x in enumerate(output_shape) if x == -1]
# export the model
assert not model.training, "internal error - model should be in eval() mode! "
with torch.no_grad():
torch.onnx.export(model, inputs[0], final_model_path, verbose=False,
input_names=input_names, output_names=output_names,
dynamic_axes=dynamic_axes, opset_version=11)
# syntactic error check
converted_model = onnx.load(final_model_path)
# check that the IR is well formed
onnx.checker.check_model(converted_model)
# load the model
session = onnxruntime.InferenceSession(final_model_path, None)
class ONNX_model:
def __init__(self, session, input_names, device):
self.session = session
self.input_names = input_names
def to_numpy(self, tensor):
return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
def __call__(self, *inputs):
inp = [(input_name, inputs[i]) for i,input_name in enumerate(self.input_names)]
inp = {input_name : self.to_numpy(x) for input_name,x in inp}
outputs = self.session.run(None, inp)
outputs = [torch.from_numpy(output) for output in outputs]
outputs = [output.to(device) for output in outputs]
if len(outputs) == 1:
outputs = outputs[0]
return outputs
# switch to eval mode
model_onnx = ONNX_model(session, input_names, device)
# run both models on inputs
assert not model.training, "internal error - model should be in eval() mode! "
models = (model, model_onnx)
outputs, time_model, outputs_onnx, time_model_onnx = self.lib.run_models(models, inputs)
# check for errors
Linf_errors = self.lib.compute_errors(outputs, outputs_onnx)
self.lib.print_errors(Linf_errors)
print('time of error check of native model: ', time_model, 'seconds')
print('time of error check of onnx model: ', time_model_onnx, 'seconds')
print()
# write Triton config
config_filename = os.path.join(model_folder, "config.pbtxt")
self.lib.write_config(config_filename,
input_shapes, input_types,
output_shapes, output_types)
def to_triton_torchscript(self, dataloader, model):
''' export the model to torchscript and test correctness on dataloader '''
# setup device
if self.args.triton_no_cuda:
device = torch.device('cpu')
else:
device = torch.device('cuda')
# prepare model
model.to(device)
model.eval()
assert not model.training, "internal error - model should be in eval() mode! "
# prepare inputs
inputs = self.lib.prepare_inputs(dataloader, device)
# generate input shapes - dynamic tensor shape support
input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
# generate input types
input_types = [x.dtype for x in inputs[0]]
# prepare save path
model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
if not os.path.exists(version_folder):
os.makedirs(version_folder)
final_model_path = os.path.join(version_folder, 'model.pt')
# convert the model
with torch.no_grad():
if self.args.ts_trace: # trace it
model_ts = torch.jit.trace(model, inputs[0])
if self.args.ts_script: # script it
model_ts = torch.jit.script(model)
# save the model
torch.jit.save(model_ts, final_model_path)
# load the model
model_ts = torch.jit.load(final_model_path)
model_ts.eval() # WAR for bug : by default, model_ts gets loaded in training mode
# run both models on inputs
assert not model.training, "internal error - model should be in eval() mode! "
assert not model_ts.training, "internal error - converted model should be in eval() mode! "
models = (model, model_ts)
outputs, time_model, outputs_ts, time_model_ts = self.lib.run_models(models, inputs)
# check for errors
Linf_errors = self.lib.compute_errors(outputs, outputs_ts)
self.lib.print_errors(Linf_errors)
print('time of error check of native model: ', time_model, 'seconds')
print('time of error check of ts model: ', time_model_ts, 'seconds')
print()
# generate output shapes - dynamic tensor shape support
output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
# generate output types
output_types = [x.dtype for x in outputs[0]]
# now we build the config for Triton
config_filename = os.path.join(model_folder, "config.pbtxt")
self.lib.write_config(config_filename,
input_shapes, input_types,
output_shapes, output_types)
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export TRITON_MODEL_OVERWRITE=True
NV_VISIBLE_DEVICES=0
bert_model=${1:-"large"}
precision=${2:-"fp32"}
init_checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
EXPORT_FORMAT=${4:-"ts-script"}
MODEL_NAME="bert_${bert_model}_${precision}"
BERT_DIR="/workspace/bert"
VOCAB_FILE="/workspace/bert/vocab/vocab"
PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json"
SQUAD_DIR="/workspace/bert/data/squad/v1.1"
OUT_DIR="/results"
BATCH_SIZE="8"
# Create common bridge for client and server
BRIDGE_NAME="tritonnet"
docker network create ${BRIDGE_NAME}
EXPORT_MODEL_ARGS="${BATCH_SIZE} ${BERT_DIR} ${EXPORT_FORMAT} ${precision} 1 ${MODEL_NAME} 0 1"
# Clean up
cleanup() {
docker kill trt_server_cont
docker network rm ${BRIDGE_NAME}
}
trap cleanup EXIT
trap cleanup SIGTERM
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${EXPORT_MODEL_ARGS} ${TRITON_MODEL_OVERWRITE}
# Start Server
echo Starting server...
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )
./triton/wait_for_triton_server.sh
CMD="python triton/run_squad_client.py \
--model_name ${MODEL_NAME} \
--do_lower_case \
--vocab_file ${VOCAB_FILE} \
--output_dir ${OUT_DIR} \
--predict_file ${PREDICT_FILE} \
--batch_size ${BATCH_SIZE}"
bash scripts/docker/launch.sh "${CMD}"
bash scripts/docker/launch.sh "python ${SQUAD_DIR}/evaluate-v1.1.py ${PREDICT_FILE} ${OUT_DIR}/predictions.json"
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NV_VISIBLE_DEVICES=${1:-"0"}
DOCKER_BRIDGE=${2:-"host"}
checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
batch_size=${4:-"8"}
BERT_DIR=${5:-"/workspace/bert"}
EXPORT_FORMAT=${6:-"ts-script"}
precision=${7:-"fp16"}
triton_model_version=${8:-1}
triton_model_name=${9:-"bertQA-ts-script"}
triton_dyn_batching_delay=${10:-0}
triton_engine_count=${11:-1}
triton_model_overwrite=${12:-"False"}
PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json"
DEPLOYER="deployer.py"
CMD="python triton/${DEPLOYER} \
--${EXPORT_FORMAT} \
--save-dir /results/triton_models \
--triton-model-name ${triton_model_name} \
--triton-model-version ${triton_model_version} \
--triton-max-batch-size ${batch_size} \
--triton-dyn-batching-delay ${triton_dyn_batching_delay} \
--triton-engine-count ${triton_engine_count} "
CMD+="-- --checkpoint ${checkpoint} \
--config_file ${BERT_DIR}/bert_config.json \
--vocab_file /workspace/bert/vocab/vocab \
--predict_file ${PREDICT_FILE} \
--do_lower_case \
--batch_size=${batch_size} "
if [[ $precision == "fp16" ]]; then
CMD+="--fp16 "
fi
bash scripts/docker/launch.sh "${CMD}" ${NV_VISIBLE_DEVICES} ${DOCKER_BRIDGE}
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export TRITON_MODEL_OVERWRITE=True
NV_VISIBLE_DEVICES=0
bert_model=${1:-"large"}
precision=${2:-"fp32"}
init_checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
EXPORT_FORMAT=${4:-"ts-script"}
PROFILING_DATA="triton/profiling_data_int64"
MODEL_NAME="bert_${bert_model}_${precision}_${EXPORT_FORMAT}"
BERT_DIR="/workspace/bert"
# Create common bridge for client and server
BRIDGE_NAME="tritonnet"
docker network create ${BRIDGE_NAME}
# Start Server
echo Starting server...
#bash triton/launch_triton_server.sh
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )
EXPORT_MODEL_ARGS="${BERT_DIR} ${EXPORT_FORMAT} ${precision} 1 ${MODEL_NAME}"
PERF_CLIENT_ARGS="50000 10 20"
# Restart Server
restart_server() {
docker kill trt_server_cont
#bash triton/launch_triton_server.sh
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )
}
# Clean up
cleanup() {
docker kill trt_server_cont
docker network rm ${BRIDGE_NAME}
}
trap cleanup EXIT
############## Dynamic Batching Comparison ##############
SERVER_BATCH_SIZE=8
CLIENT_BATCH_SIZE=1
TRITON_ENGINE_COUNT=1
TEST_NAME="DYN_BATCH_"
# Dynamic batching 10 ms
TRITON_DYN_BATCHING_DELAY=10
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Dynamic batching 5 ms
TRITON_DYN_BATCHING_DELAY=5
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Dynamic batching 2 ms
TRITON_DYN_BATCHING_DELAY=2
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Static Batching (i.e. Dynamic batching 0 ms)
TRITON_DYN_BATCHING_DELAY=0
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
############## Engine Count Comparison ##############
SERVER_BATCH_SIZE=1
CLIENT_BATCH_SIZE=1
TRITON_DYN_BATCHING_DELAY=0
TEST_NAME="ENGINE_C_"
# Engine Count = 4
TRITON_ENGINE_COUNT=4
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_ENGINE_COUNT} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Engine Count = 2
TRITON_ENGINE_COUNT=2
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_ENGINE_COUNT} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Engine Count = 1
TRITON_ENGINE_COUNT=1
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_ENGINE_COUNT} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
############## Batch Size Comparison ##############
# BATCH=1 Generate model and perf
SERVER_BATCH_SIZE=1
CLIENT_BATCH_SIZE=1
TRITON_ENGINE_COUNT=1
TRITON_DYN_BATCHING_DELAY=0
TEST_NAME="BATCH_SIZE_"
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 64 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# BATCH=2 Generate model and perf
SERVER_BATCH_SIZE=2
CLIENT_BATCH_SIZE=2
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 32 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# BATCH=4 Generate model and perf
SERVER_BATCH_SIZE=4
CLIENT_BATCH_SIZE=4
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 16 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# BATCH=8 Generate model and perf
SERVER_BATCH_SIZE=8
CLIENT_BATCH_SIZE=8
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 8 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DOCKER_BRIDGE=${1:-"bridge"}
NV_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES:-"0"}
# Start TRITON server in detached state
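# Ports follow the Triton defaults assumed by the other scripts in this commit:
# 8000 = HTTP/REST, 8001 = gRPC (used by perf_client), 8002 = metrics.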
docker run -d --rm \
--gpus device=${NV_VISIBLE_DEVICES} \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--network=${DOCKER_BRIDGE} \
-p 8000:8000 \
-p 8001:8001 \
-p 8002:8002 \
--name trt_server_cont \
-v $PWD/results/triton_models:/models \
nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_NAME=${1:-"bert"}
MODEL_VERSION=${2:-1}
precision=${3:-"fp32"}
BATCH_SIZE=${4:-1}
MAX_LATENCY=${5:-500}
MAX_CLIENT_THREADS=${6:-10}
MAX_CONCURRENCY=${7:-50}
SERVER_HOSTNAME=${8:-"localhost"}
DOCKER_BRIDGE=${9:-"host"}
RESULTS_ID=${10:-""}
PROFILING_DATA=${11:-"triton/profiling_data_int64"}
NV_VISIBLE_DEVICES=${12:-"0"}
if [[ $SERVER_HOSTNAME == *":"* ]]; then
echo "ERROR! Do not include the port when passing the Server Hostname. These scripts require that the TRITON HTTP endpoint is on Port 8000 and the gRPC endpoint is on Port 8001. Exiting..."
exit 1
fi
if [ "$SERVER_HOSTNAME" = "localhost" ]
then
if [ ! "$(docker inspect -f "{{.State.Running}}" trt_server_cont)" = "true" ] ; then
echo "Launching TRITON server"
bash triton/launch_triton_server.sh ${DOCKER_BRIDGE} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES
SERVER_LAUNCHED=true
function cleanup_server {
docker kill trt_server_cont
}
# Ensure we cleanup the server on exit
# trap "exit" INT TERM
trap cleanup_server EXIT
fi
fi
# Wait until server is up. curl on the health of the server and sleep until its ready
bash triton/wait_for_triton_server.sh $SERVER_HOSTNAME
TIMESTAMP=$(date "+%y%m%d_%H%M")
# Create model directory on host (directory /results is mounted)
bash scripts/docker/launch.sh "mkdir -p /results/perf_client/${MODEL_NAME}"
if [ ! -z "${RESULTS_ID}" ];
then
RESULTS_ID="_${RESULTS_ID}"
fi
OUTPUT_FILE_CSV="/results/perf_client/${MODEL_NAME}/results${RESULTS_ID}_${TIMESTAMP}.csv"
ARGS="\
--max-threads ${MAX_CLIENT_THREADS} \
-m ${MODEL_NAME} \
-x ${MODEL_VERSION} \
-p 3000 \
-d \
-v \
-i gRPC \
-u ${SERVER_HOSTNAME}:8001 \
-b ${BATCH_SIZE} \
-l ${MAX_LATENCY} \
-c ${MAX_CONCURRENCY} \
-f ${OUTPUT_FILE_CSV} \
--input-data ${PROFILING_DATA}"
echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')"
bash scripts/docker/launch.sh "/workspace/install/bin/perf_client $ARGS" all $DOCKER_BRIDGE
#!/usr/bin/python
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import numpy as np
import os
import sys
from builtins import range
import collections
from tqdm import tqdm
import time
from tensorrtserver.api import *
sys.path.append('.')
from run_squad import get_answers, convert_examples_to_features, read_squad_examples
from tokenization import BertTokenizer
import json
import pickle
from functools import partial  # needed for the asynchronous result callback below
args = None
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False,
help='Enable verbose output')
parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8000',
help='Inference server URL. Default is localhost:8000.')
parser.add_argument('-i', '--protocol', type=str, required=False, default='http',
help='Protocol ("http"/"grpc") used to ' +
'communicate with inference service. Default is "http".')
parser.add_argument('-H', dest='http_headers', metavar="HTTP_HEADER",
required=False, action='append',
help='HTTP headers to add to inference server requests. ' +
'Format is -H"Header:Value".')
parser.add_argument('--synchronous', action='store_true', help="Wait for previous request to finish before sending next request.")
parser.add_argument("--model_name",
type=str,
default='bert',
help="Specify model to run")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. \
True for uncased models, False for cased models.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--batch_size', default=1, type=int,
help='Maximal number of examples in a batch')
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
args = parser.parse_args()
# TRITON client setup
protocol = ProtocolType.from_str(args.protocol)
model_version = -1
infer_ctx = InferContext(args.url, protocol, args.model_name, model_version,
http_headers=args.http_headers, verbose=args.verbose)
# Preprocess input data
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride)
eval_examples = read_squad_examples(
input_file=args.predict_file,
is_training=False,
version_2_with_negative=args.version_2_with_negative)
try:
with open(cached_features_file, "rb") as reader:
eval_features = pickle.load(reader)
except: # no usable cached features; rebuild them from the SQuAD file below
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
with open(cached_features_file, "wb") as writer:
pickle.dump(eval_features, writer)
dtype = np.int64
def batch(iterable, n=1):
l = len(iterable)
for ndx in range(0, l, n):
unique_ids = ()
example_indices = ()
input_ids_data = ()
input_mask_data = ()
segment_ids_data = ()
for i in range(0, min(n, l-ndx)):
unique_ids = unique_ids + (iterable[ndx + i].unique_id,)
example_indices = example_indices + (ndx + i,)
input_ids_data = input_ids_data + (np.array(iterable[ndx + i].input_ids, dtype=dtype),)
input_mask_data = input_mask_data + (np.array(iterable[ndx + i].input_mask, dtype=dtype),)
segment_ids_data = segment_ids_data + (np.array(iterable[ndx + i].segment_ids, dtype=dtype),)
inputs_dict = {'input__0': input_ids_data,
'input__1': segment_ids_data,
'input__2': input_mask_data}
yield inputs_dict, example_indices, unique_ids
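# Each yielded inputs_dict maps the Triton input names to a tuple with one int64 array per
# example in the batch: input__0 = token ids, input__1 = segment ids, input__2 = attention mask.
# example_indices and unique_ids let the result handlers match logits back to SQuAD features.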
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
ExampleInfo = collections.namedtuple("ExampleInfo",
["start_time", "batch_size", "example_ids", "unique_ids"])
all_results = []
time_list = []
outstanding = 0
sent_prog = tqdm(desc="Sending Requests", total=len(eval_features), file=sys.stdout, unit='sentences')
recv_prog = tqdm(desc="Processed Requests", total=len(eval_features), file=sys.stdout, unit='sentences')
if args.synchronous:
raw_results = []
def process_result_cb(example_info, ctx, request_id):
global outstanding
result = infer_ctx.get_async_run_results(request_id)
stop = time.time()
outstanding -= 1
time_list.append(stop - example_info.start_time)
batch_count = example_info.batch_size
for i in range(batch_count):
unique_id = int(example_info.unique_ids[i])
start_logits = [float(x) for x in result["output__0"][i].flat]
end_logits = [float(x) for x in result["output__1"][i].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
recv_prog.update(n=batch_count)
all_results_start = time.time()
for input_dict, example_indices, unique_ids in batch(eval_features, args.batch_size):
current_bs = len(input_dict['input__0'])
outputs_dict = {'output__0': InferContext.ResultFormat.RAW,
'output__1': InferContext.ResultFormat.RAW}
start = time.time()
example_info = ExampleInfo(start_time=start,
batch_size=current_bs,
example_ids=example_indices,
unique_ids=unique_ids
)
if not args.synchronous:
outstanding += 1
result_id = infer_ctx.async_run(partial(process_result_cb, example_info),
input_dict,
outputs_dict,
batch_size=current_bs)
else:
result = infer_ctx.run(input_dict, outputs_dict, batch_size=current_bs)
raw_results.append((example_info, result))
sent_prog.update(n=current_bs)
# Make sure that all sent requests have been processed
while outstanding > 0:
pass
all_results_end = time.time()
all_results_total = (all_results_end - all_results_start) * 1000.0
num_batches = (len(eval_features) + args.batch_size - 1) // args.batch_size
if args.synchronous:
for result in raw_results:
example_info, batch = result
for i in range(example_info.batch_size):
unique_id = int(example_info.unique_ids[i])
start_logits = [float(x) for x in batch["output__0"][i].flat]
end_logits = [float(x) for x in batch["output__1"][i].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
recv_prog.update(n=example_info.batch_size)
print("-----------------------------")
print("Individual Time Runs")
print("Total Time: {} ms".format(all_results_total))
print("-----------------------------")
print("-----------------------------")
print("Total Inference Time = %0.2f for"
"Sentences processed = %d" % (sum(time_list), len(eval_features)))
print("Throughput Average (sentences/sec) = %0.2f" % (len(eval_features) / all_results_total * 1000.0))
print("Throughput Average (batches/sec) = %0.2f" % (num_batches / all_results_total * 1000.0))
print("-----------------------------")
if not args.synchronous:
time_list.sort()
avg = np.mean(time_list)
cf_95 = max(time_list[:int(len(time_list) * 0.95)])
cf_99 = max(time_list[:int(len(time_list) * 0.99)])
cf_100 = max(time_list[:int(len(time_list) * 1)])
print("-----------------------------")
print("Summary Statistics")
print("Batch size =", args.batch_size)
print("Sequence Length =", args.max_seq_length)
print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
print("Latency Confidence Level 99 (ms) =", cf_99 * 1000)
print("Latency Confidence Level 100 (ms) =", cf_100 * 1000)
print("Latency Average (ms) =", avg * 1000)
print("-----------------------------")
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args)
with open(output_prediction_file, "w") as f:
f.write(json.dumps(answers, indent=4) + "\n")
with open(output_nbest_file, "w") as f:
f.write(json.dumps(nbest_answers, indent=4) + "\n")
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SERVER_URI=${1:-"localhost"}
echo "Waiting for TRITON Server to be ready at http://$SERVER_URI:8000..."
live_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/live"
ready_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/ready"
current_status=$($live_command)
echo $current_status
# First check the current status. If that passes, check the json. If either fail, loop
while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do
printf "."
sleep 1
current_status=$($live_command)
done
echo "TRITON Server is ready!"
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist
from pathlib import Path
def get_rank():
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
return dist.get_rank()
def get_world_size():
if not dist.is_available():
return 1
if not dist.is_initialized():
return 1
return dist.get_world_size()
def is_main_process():
return get_rank() == 0
def barrier():
if dist.is_available() and dist.is_initialized():
dist.barrier()
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s
def mkdir(path):
Path(path).mkdir(parents=True, exist_ok=True)
def mkdir_by_main_process(path):
if is_main_process():
mkdir(path)
barrier()
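# Minimal usage sketch (assumes torch.distributed has already been initialized by the launcher
# when running multi-GPU; the output directory below is only an example):
#
#   from utils import is_main_process, mkdir_by_main_process
#   mkdir_by_main_process("/results/output")   # rank 0 creates it, all ranks wait at the barrier
#   if is_main_process():
#       print("only rank 0 logs this")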