"vscode:/vscode.git/clone" did not exist on "22f55e1b15287b5e7e46e49f523c95a4c93c996c"
Commit e5ca7e62 authored by hepj987's avatar hepj987
Browse files

初始化仓库

parents
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os
import json
import argparse
import numpy as np
from builtins import range
from tensorrtserver.api import *
#
import sys
sys.path.append('../')
from inference import preprocess_tokenized_text, get_answer
from tokenization import BertTokenizer
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', type=int, default=1,
help='batch size for inference. default: 1')
parser.add_argument("--triton-model-name", type=str, default="model_name",
help="the name of the model used for inference")
parser.add_argument("--triton-model-version", type=int, default=-1,
help="the version of the model used for inference")
parser.add_argument("--triton-server-url", type=str, default="localhost:8000",
help="Inference server URL. Default is localhost:8000.")
parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False,
help='Enable verbose output')
parser.add_argument('--protocol', type=str, required=False, default='http',
help='Protocol ("http"/"grpc") used to ' +
'communicate with inference service. Default is "http".')
parser.add_argument('-H', dest='http_headers', metavar="HTTP_HEADER",
required=False, action='append',
help='HTTP headers to add to inference server requests. ' +
'Format is -H"Header:Value".')
## pre- and postprocessing parameters
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. ")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--n_best_size", default=1, type=int,
help="The total number of n-best predictions to generate. ")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, then the model can reply with "unknown". ')
parser.add_argument('--null_score_diff_threshold',
type=float, default=-11.0,
help="If null_score - best_non_null is greater than the threshold predict 'unknown'. ")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
# input texts
parser.add_argument("--question", default="Most antibiotics target bacteria and don't affect what class of organisms? ",
type=str, help="question")
parser.add_argument("--context", default="Within the genitourinary and gastrointestinal tracts, commensal flora serve as biological barriers by competing with pathogenic bacteria for food and space and, in some cases, by changing the conditions in their environment, such as pH or available iron. This reduces the probability that pathogens will reach sufficient numbers to cause illness. However, since most antibiotics non-specifically target bacteria and do not affect fungi, oral antibiotics can lead to an overgrowth of fungi and cause conditions such as a vaginal candidiasis (a yeast infection). There is good evidence that re-introduction of probiotic flora, such as pure cultures of the lactobacilli normally found in unpasteurized yogurt, helps restore a healthy balance of microbial populations in intestinal infections in children and encouraging preliminary data in studies on bacterial gastroenteritis, inflammatory bowel diseases, urinary tract infection and post-surgical infections. ",
type=str, help="context")
args = parser.parse_args()
args.protocol = ProtocolType.from_str(args.protocol)
# Create a health context, get the ready and live state of server.
health_ctx = ServerHealthContext(args.triton_server_url, args.protocol,
http_headers=args.http_headers, verbose=args.verbose)
print("Health for model {}".format(args.triton_model_name))
print("Live: {}".format(health_ctx.is_live()))
print("Ready: {}".format(health_ctx.is_ready()))
# Create a status context and get server status
status_ctx = ServerStatusContext(args.triton_server_url, args.protocol, args.triton_model_name,
http_headers=args.http_headers, verbose=args.verbose)
print("Status for model {}".format(args.triton_model_name))
print(status_ctx.get_server_status())
# Create the inference context for the model.
infer_ctx = InferContext(args.triton_server_url, args.protocol, args.triton_model_name, args.triton_model_version,
http_headers=args.http_headers, verbose=args.verbose)
print("question: ", args.question)
print("context: ", args.context)
print()
# pre-processing
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
doc_tokens = args.context.split()
query_tokens = tokenizer.tokenize(args.question)
feature = preprocess_tokenized_text(doc_tokens,
query_tokens,
tokenizer,
max_seq_length=args.max_seq_length,
max_query_length=args.max_query_length)
tensors_for_inference, tokens_for_postprocessing = feature
dtype = np.int64
input_ids = np.array(tensors_for_inference.input_ids, dtype=dtype)[None,...] # make bs=1
segment_ids = np.array(tensors_for_inference.segment_ids, dtype=dtype)[None,...] # make bs=1
input_mask = np.array(tensors_for_inference.input_mask, dtype=dtype)[None,...] # make bs=1
assert args.batch_size == input_ids.shape[0]
assert args.batch_size == segment_ids.shape[0]
assert args.batch_size == input_mask.shape[0]
# prepare inputs
input_dict = {
"input__0" : tuple(input_ids[i] for i in range(args.batch_size)),
"input__1" : tuple(segment_ids[i] for i in range(args.batch_size)),
"input__2" : tuple(input_mask[i] for i in range(args.batch_size))
}
# prepare outputs
output_keys = [
"output__0",
"output__1"
]
output_dict = {}
for k in output_keys:
output_dict[k] = InferContext.ResultFormat.RAW
# Send inference request to the inference server.
result = infer_ctx.run(input_dict, output_dict, args.batch_size)
# get the result
start_logits = result["output__0"][0].tolist()
end_logits = result["output__1"][0].tolist()
# post-processing
answer, answers = get_answer(doc_tokens, tokens_for_postprocessing,
start_logits, end_logits, args)
# print result
print()
print(answer)
print()
print(json.dumps(answers, indent=4))
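# A minimal sketch of issuing the same request asynchronously with this legacy client API,
# mirroring the async_run()/get_async_run_results() callback pattern used by
# triton/run_squad_client.py elsewhere in this commit (the callback name is illustrative):
#
#   def on_result(ctx, request_id):
#       async_result = infer_ctx.get_async_run_results(request_id)
#       # post-process async_result["output__0"] / async_result["output__1"] exactly as above
#
#   infer_ctx.async_run(on_result, input_dict, output_dict, batch_size=args.batch_size)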
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import pickle
import argparse
import deployer_lib
#
import sys
sys.path.append('../')
sys.path.append('.')
from modeling import BertForQuestionAnswering, BertConfig
from tokenization import BertTokenizer
from run_squad import convert_examples_to_features, read_squad_examples
def get_model_args(model_args):
''' the arguments initialize_model will receive '''
parser = argparse.ArgumentParser()
## Required parameters by the model.
parser.add_argument("--checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint of the model. ")
parser.add_argument('--batch_size',
default=8,
type=int,
help='Batch size for inference')
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument('--fp16',
action='store_true',
help="use mixed-precision")
parser.add_argument('--nbatches',
default=2,
type=int,
help='Number of batches in the inference dataloader. Default: 2. ')
return parser.parse_args(model_args)
def initialize_model(args):
''' return model, ready to trace '''
config = BertConfig.from_json_file(args.config_file)
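# Pad the vocabulary size up to a multiple of 8 (a Tensor Core friendly alignment); the
# checkpoint loaded below is expected to carry the same padded embedding size.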
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
model = BertForQuestionAnswering(config)
model.enable_apex(False)
state_dict = torch.load(args.checkpoint, map_location='cpu')["model"]
model.load_state_dict(state_dict)
if args.fp16:
model.half()
return model
def get_dataloader(args):
''' return dataloader for inference '''
# Preprocess input data
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride)
try:
with open(cached_features_file, "rb") as reader:
eval_features = pickle.load(reader)
except: # no usable cached features; rebuild them from the SQuAD file below
eval_examples = read_squad_examples(
input_file=args.predict_file,
is_training=False,
version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
with open(cached_features_file, "wb") as writer:
pickle.dump(eval_features, writer)
data = []
for feature in eval_features:
input_ids = torch.tensor(feature.input_ids, dtype=torch.int64)
input_mask = torch.tensor(feature.input_mask, dtype=torch.int64)
segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64)
inp = (input_ids, segment_ids, input_mask)
data.append(inp)
if args.nbatches > 0:
data = data[:args.nbatches*args.batch_size]
test_loader = torch.utils.data.DataLoader(
data,
batch_size=args.batch_size,
shuffle=False,
num_workers=1,
pin_memory=True)
return test_loader
if __name__=='__main__':
# don't touch this!
deployer, model_argv = deployer_lib.create_deployer(sys.argv[1:]) # returns the deployer object and the leftover (model-specific) arguments
model_args = get_model_args(model_argv)
model = initialize_model(model_args)
dataloader = get_dataloader(model_args)
deployer.deploy(dataloader, model)
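# Example invocation (see triton/export_model.sh in this commit for the full command line):
#
#   python triton/deployer.py --ts-script \
#       --save-dir /results/triton_models \
#       --triton-model-name bertQA-ts-script --triton-model-version 1 \
#       --triton-max-batch-size 8 --triton-dyn-batching-delay 0 --triton-engine-count 1 \
#       -- --checkpoint /workspace/bert/checkpoints/bert_qa.pt \
#          --config_file /workspace/bert/bert_config.json \
#          --vocab_file /workspace/bert/vocab/vocab \
#          --predict_file /workspace/bert/data/squad/v1.1/dev-v1.1.json \
#          --do_lower_case --batch_size=8 --fp16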
#!/usr/bin/python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import time
import json
import torch
import argparse
import statistics
from collections import Counter
torch_type_to_triton_type = {
torch.bool: 'TYPE_BOOL',
torch.int8: 'TYPE_INT8',
torch.int16: 'TYPE_INT16',
torch.int32: 'TYPE_INT32',
torch.int64: 'TYPE_INT64',
torch.uint8: 'TYPE_UINT8',
torch.float16: 'TYPE_FP16',
torch.float32: 'TYPE_FP32',
torch.float64: 'TYPE_FP64'
}
CONFIG_TEMPLATE = r"""
name: "{model_name}"
platform: "{platform}"
max_batch_size: {max_batch_size}
input [
{spec_inputs}
]
output [
{spec_outputs}
]
{dynamic_batching}
{model_optimizations}
instance_group [
{{
count: {engine_count}
kind: KIND_GPU
gpus: [ {gpu_list} ]
}}
]"""
INPUT_TEMPLATE = r"""
{{
name: "input__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""
OUTPUT_TEMPLATE = r"""
{{
name: "output__{num}"
data_type: {type}
dims: {dims}
{reshape}
}},"""
MODEL_OPTIMIZATION_TEMPLATE = r"""
optimization {{
cuda {{
graphs: {capture_cuda_graph}
}}
}}"""
def remove_empty_lines(text):
''' removes empty lines from text, returns the result '''
ret = "".join([s for s in text.strip().splitlines(True) if s.strip()])
return ret
def create_deployer(argv):
''' takes a list of arguments, returns a deployer object and the list of unused arguments '''
parser = argparse.ArgumentParser()
# required args
method = parser.add_mutually_exclusive_group(required=True)
method.add_argument('--ts-script',
action='store_true',
help='convert to torchscript using torch.jit.script')
method.add_argument('--ts-trace',
action='store_true',
help='convert to torchscript using torch.jit.trace')
method.add_argument('--onnx',
action='store_true',
help='convert to onnx using torch.onnx.export')
# triton related args
arguments = parser.add_argument_group('triton related flags')
arguments.add_argument('--triton-no-cuda',
action='store_true',
help='Use the CPU for tracing.')
arguments.add_argument('--triton-model-name',
type=str,
default="model",
help="exports to appropriate directory structure for Triton")
arguments.add_argument("--triton-model-version",
type=int,
default=1,
help="exports to appropriate directory structure for Triton")
arguments.add_argument("--triton-server-url",
type=str,
default="localhost:8001",
help="exports to appropriate directory structure for Triton")
arguments.add_argument("--triton-max-batch-size",
type=int,
default=8,
help="Specifies the 'max_batch_size' in the Triton model config.\
See the Triton documentation for more info.")
arguments.add_argument("--triton-dyn-batching-delay",
type=float,
default=0,
help="Determines the dynamic_batching queue delay in milliseconds(ms) for\
the Triton model config. Use '0' or '-1' to specify static batching.\
See the Triton documentation for more info.")
arguments.add_argument("--triton-engine-count",
type=int,
default=1,
help="Specifies the 'instance_group' count value in the Triton model config.\
See the Triton documentation for more info.")
arguments.add_argument('--save-dir', type=str, default='./triton_models', help='Saved model directory')
# optimization args
arguments = parser.add_argument_group('optimization flags')
arguments.add_argument("--capture-cuda-graph",
type=int,
default=0,
help="capture cuda graph for obtaining speedup. possible values: 0, 1. default: 0 (automatic). ")
# remainder args
arguments.add_argument('model_arguments', nargs=argparse.REMAINDER, help='arguments that will be ignored by deployer lib and will be forwarded to your deployer script')
#
args = parser.parse_args(argv)
deployer = Deployer(args)
#
return deployer, args.model_arguments[1:]
class DeployerLibrary:
def __init__(self, args):
self.args = args
self.platform = None
def set_platform(self, platform):
''' sets the platform
:: platform :: "pytorch_libtorch" or "onnxruntime_onnx" or "tensorrt_plan"
'''
self.platform = platform
def prepare_inputs(self, dataloader, device):
''' load sample inputs to device '''
inputs = []
for batch in dataloader:
if type(batch) is torch.Tensor:
batch_d = batch.to(device)
batch_d = (batch_d,)
inputs.append(batch_d)
else:
batch_d = []
for x in batch:
assert type(x) is torch.Tensor, "input is not a tensor"
batch_d.append(x.to(device))
batch_d = tuple(batch_d)
inputs.append(batch_d)
return inputs
def get_list_of_shapes(self, l, fun):
''' returns the list of min/max shapes, depending on fun
:: l :: list of tuples of tensors
:: fun :: min or max
'''
tensor_tuple = l[0]
shapes = [list(x.shape) for x in tensor_tuple]
for tensor_tuple in l:
assert len(tensor_tuple) == len(shapes), "tensors with varying shape lengths are not supported"
for i,x in enumerate(tensor_tuple):
for j in range(len(x.shape)):
shapes[i][j] = fun(shapes[i][j], x.shape[j])
return shapes # a list of shapes
def get_tuple_of_min_shapes(self, l):
''' returns the tuple of min shapes
:: l :: list of tuples of tensors '''
shapes = self.get_list_of_shapes(l, min)
min_batch = 1
shapes = [[min_batch,*shape[1:]] for shape in shapes]
shapes = tuple(shapes)
return shapes # tuple of min shapes
def get_tuple_of_max_shapes(self, l):
''' returns the tuple of max shapes
:: l :: list of tuples of tensors '''
shapes = self.get_list_of_shapes(l, max)
max_batch = max(2,shapes[0][0])
shapes = [[max_batch,*shape[1:]] for shape in shapes]
shapes = tuple(shapes)
return shapes # tuple of max shapes
def get_tuple_of_opt_shapes(self, l):
''' returns the tuple of opt shapes
:: l :: list of tuples of tensors '''
counter = Counter()
for tensor_tuple in l:
shapes = [tuple(x.shape) for x in tensor_tuple]
shapes = tuple(shapes)
counter[shapes] += 1
shapes = counter.most_common(1)[0][0]
return shapes # tuple of the most commonly occurring shapes
def get_tuple_of_dynamic_shapes(self, l):
''' returns a tuple of dynamic shapes: variable tensor dimensions
(for ex. batch size) occur as -1 in the tuple
:: l :: list of tuples of tensors '''
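# e.g. for a list of (input_ids, segment_ids, input_mask) tuples all padded to [batch, 384],
# only the batch dimension varies, so this returns ([-1, 384], [-1, 384], [-1, 384])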
tensor_tuple = l[0]
shapes = [list(x.shape) for x in tensor_tuple]
for tensor_tuple in l:
err_msg = "tensors with varying shape lengths are not supported"
assert len(tensor_tuple) == len(shapes), err_msg
for i,x in enumerate(tensor_tuple):
for j in range(len(x.shape)):
if shapes[i][j] != x.shape[j] or j == 0:
shapes[i][j] = -1
shapes = tuple(shapes)
return shapes # tuple of dynamic shapes
def run_models(self, models, inputs):
''' run the models on inputs, return the outputs and execution times '''
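# for models = (A, B) the flat return value is [outputs_A, time_A, outputs_B, time_B];
# callers below unpack it as outputs, time_model, outputs_X, time_model_X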
ret = []
for model in models:
torch.cuda.synchronize()
time_start = time.time()
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
torch.cuda.synchronize()
time_end = time.time()
t = time_end - time_start
ret.append(outputs)
ret.append(t)
return ret
def compute_errors(self, outputs_A, outputs_B):
''' returns the list of L_inf errors computed over every single output tensor '''
Linf_errors = []
for output_A,output_B in zip(outputs_A,outputs_B):
for x,y in zip(output_A, output_B):
error = (x - y).norm(float('inf')).item()
Linf_errors.append(error)
return Linf_errors
def print_errors(self, Linf_errors):
''' print various statistics of the Linf errors '''
print()
print("conversion correctness test results")
print("-----------------------------------")
print("maximal absolute error over dataset (L_inf): ", max(Linf_errors))
print()
print("average L_inf error over output tensors: ", statistics.mean(Linf_errors))
print("variance of L_inf error over output tensors: ", statistics.variance(Linf_errors))
print("stddev of L_inf error over output tensors: ", statistics.stdev(Linf_errors))
print()
def write_config(self, config_filename,
input_shapes, input_types,
output_shapes, output_types):
''' writes Triton config file
:: config_filename :: the file to write the config file into
:: input_shapes :: tuple of dynamic shapes of the input tensors
:: input_types :: tuple of torch types of the input tensors
:: output_shapes :: tuple of dynamic shapes of the output tensors
:: output_types :: tuple of torch types of the output tensors
'''
assert self.platform is not None, "error - platform is not set"
config_template = CONFIG_TEMPLATE
input_template = INPUT_TEMPLATE
optimization_template = MODEL_OPTIMIZATION_TEMPLATE
spec_inputs = r""""""
for i,(shape,typ) in enumerate(zip(input_shapes,input_types)):
d = {
'num' : str(i),
'type': torch_type_to_triton_type[typ],
'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
}
d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
spec_inputs += input_template.format_map(d)
spec_inputs = spec_inputs[:-1]
output_template = OUTPUT_TEMPLATE
spec_outputs = r""""""
for i,(shape,typ) in enumerate(zip(output_shapes,output_types)):
d = {
'num' : str(i),
'type': torch_type_to_triton_type[typ],
'dims': str([1]) if len(shape) == 1 else str(list(shape)[1:]) # first dimension is the batch size
}
d['reshape'] = 'reshape: { shape: [ ] }' if len(shape) == 1 else ''
spec_outputs += output_template.format_map(d)
spec_outputs = spec_outputs[:-1]
batching_str = ""
max_batch_size = self.args.triton_max_batch_size
if (self.args.triton_dyn_batching_delay > 0):
# Use only full and half full batches
pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]
batching_str = r"""
dynamic_batching {{
preferred_batch_size: [{0}]
max_queue_delay_microseconds: {1}
}}""".format(", ".join([str(x) for x in pref_batch_size]),
int(self.args.triton_dyn_batching_delay * 1000.0))
d = {
"capture_cuda_graph": str(self.args.capture_cuda_graph)
}
optimization_str = optimization_template.format_map(d)
config_values = {
"model_name": self.args.triton_model_name,
"platform": self.platform,
"max_batch_size": max_batch_size,
"spec_inputs": spec_inputs,
"spec_outputs": spec_outputs,
"dynamic_batching": batching_str,
"model_optimizations" : optimization_str,
"gpu_list": ", ".join([str(x) for x in range(torch.cuda.device_count())]),
"engine_count": self.args.triton_engine_count
}
# write config
with open(config_filename, "w") as file:
final_config_str = config_template.format_map(config_values)
final_config_str = remove_empty_lines(final_config_str)
file.write(final_config_str)
class Deployer:
def __init__(self, args):
self.args = args
self.lib = DeployerLibrary(args)
def deploy(self, dataloader, model):
''' deploy the model and test for correctness with dataloader '''
if self.args.ts_script or self.args.ts_trace:
self.lib.set_platform("pytorch_libtorch")
print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
self.to_triton_torchscript(dataloader, model)
elif self.args.onnx:
self.lib.set_platform("onnxruntime_onnx")
print("deploying model " + self.args.triton_model_name + " in format " + self.lib.platform)
self.to_triton_onnx(dataloader, model)
else:
assert False, "error"
print("done")
def to_triton_onnx(self, dataloader, model):
''' export the model to onnx and test correctness on dataloader '''
import onnx
import onnxruntime
# setup device
if self.args.triton_no_cuda:
device = torch.device('cpu')
else:
device = torch.device('cuda')
# prepare model
model.to(device)
model.eval()
assert not model.training, "internal error - model should be in eval() mode! "
# prepare inputs
inputs = self.lib.prepare_inputs(dataloader, device)
# generate outputs
outputs = []
for input in inputs:
with torch.no_grad():
output = model(*input)
if type(output) is torch.Tensor:
output = [output]
outputs.append(output)
# generate input shapes - dynamic tensor shape support
input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
# generate output shapes - dynamic tensor shape support
output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
# generate input types
input_types = [x.dtype for x in inputs[0]]
# generate output types
output_types = [x.dtype for x in outputs[0]]
# get input names
rng = range(len(input_types))
input_names = ["input__" + str(num) for num in rng]
# get output names
rng = range(len(output_types))
output_names = ["output__" + str(num) for num in rng]
# prepare save path
model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
if not os.path.exists(version_folder):
os.makedirs(version_folder)
final_model_path = os.path.join(version_folder, 'model.onnx')
# get indices of dynamic input and output shapes
dynamic_axes = {}
for input_name,input_shape in zip(input_names,input_shapes):
dynamic_axes[input_name] = [i for i,x in enumerate(input_shape) if x == -1]
for output_name,output_shape in zip(output_names,output_shapes):
dynamic_axes[output_name] = [i for i,x in enumerate(output_shape) if x == -1]
# export the model
assert not model.training, "internal error - model should be in eval() mode! "
with torch.no_grad():
torch.onnx.export(model, inputs[0], final_model_path, verbose=False,
input_names=input_names, output_names=output_names,
dynamic_axes=dynamic_axes, opset_version=11)
# syntactic error check
converted_model = onnx.load(final_model_path)
# check that the IR is well formed
onnx.checker.check_model(converted_model)
# load the model
session = onnxruntime.InferenceSession(final_model_path, None)
class ONNX_model:
def __init__(self, session, input_names, device):
self.session = session
self.input_names = input_names
def to_numpy(self, tensor):
return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
def __call__(self, *inputs):
inp = [(input_name, inputs[i]) for i,input_name in enumerate(self.input_names)]
inp = {input_name : self.to_numpy(x) for input_name,x in inp}
outputs = self.session.run(None, inp)
outputs = [torch.from_numpy(output) for output in outputs]
outputs = [output.to(device) for output in outputs]
if len(outputs) == 1:
outputs = outputs[0]
return outputs
# switch to eval mode
model_onnx = ONNX_model(session, input_names, device)
# run both models on inputs
assert not model.training, "internal error - model should be in eval() mode! "
models = (model, model_onnx)
outputs, time_model, outputs_onnx, time_model_onnx = self.lib.run_models(models, inputs)
# check for errors
Linf_errors = self.lib.compute_errors(outputs, outputs_onnx)
self.lib.print_errors(Linf_errors)
print('time of error check of native model: ', time_model, 'seconds')
print('time of error check of onnx model: ', time_model_onnx, 'seconds')
print()
# write Triton config
config_filename = os.path.join(model_folder, "config.pbtxt")
self.lib.write_config(config_filename,
input_shapes, input_types,
output_shapes, output_types)
def to_triton_torchscript(self, dataloader, model):
''' export the model to torchscript and test correctness on dataloader '''
# setup device
if self.args.triton_no_cuda:
device = torch.device('cpu')
else:
device = torch.device('cuda')
# prepare model
model.to(device)
model.eval()
assert not model.training, "internal error - model should be in eval() mode! "
# prepare inputs
inputs = self.lib.prepare_inputs(dataloader, device)
# generate input shapes - dynamic tensor shape support
input_shapes = self.lib.get_tuple_of_dynamic_shapes(inputs)
# generate input types
input_types = [x.dtype for x in inputs[0]]
# prepare save path
model_folder = os.path.join(self.args.save_dir, self.args.triton_model_name)
version_folder = os.path.join(model_folder, str(self.args.triton_model_version))
if not os.path.exists(version_folder):
os.makedirs(version_folder)
final_model_path = os.path.join(version_folder, 'model.pt')
# convert the model
with torch.no_grad():
if self.args.ts_trace: # trace it
model_ts = torch.jit.trace(model, inputs[0])
if self.args.ts_script: # script it
model_ts = torch.jit.script(model)
# save the model
torch.jit.save(model_ts, final_model_path)
# load the model
model_ts = torch.jit.load(final_model_path)
model_ts.eval() # WAR for bug : by default, model_ts gets loaded in training mode
# run both models on inputs
assert not model.training, "internal error - model should be in eval() mode! "
assert not model_ts.training, "internal error - converted model should be in eval() mode! "
models = (model, model_ts)
outputs, time_model, outputs_ts, time_model_ts = self.lib.run_models(models, inputs)
# check for errors
Linf_errors = self.lib.compute_errors(outputs, outputs_ts)
self.lib.print_errors(Linf_errors)
print('time of error check of native model: ', time_model, 'seconds')
print('time of error check of ts model: ', time_model_ts, 'seconds')
print()
# generate output shapes - dynamic tensor shape support
output_shapes = self.lib.get_tuple_of_dynamic_shapes(outputs)
# generate output types
output_types = [x.dtype for x in outputs[0]]
# now we build the config for Triton
config_filename = os.path.join(model_folder, "config.pbtxt")
self.lib.write_config(config_filename,
input_shapes, input_types,
output_shapes, output_types)
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export TRITON_MODEL_OVERWRITE=True
NV_VISIBLE_DEVICES=0
bert_model=${1:-"large"}
precision=${2:-"fp32"}
init_checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
EXPORT_FORMAT=${4:-"ts-script"}
MODEL_NAME="bert_${bert_model}_${precision}"
BERT_DIR="/workspace/bert"
VOCAB_FILE="/workspace/bert/vocab/vocab"
PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json"
SQUAD_DIR="/workspace/bert/data/squad/v1.1"
OUT_DIR="/results"
BATCH_SIZE="8"
# Create common bridge for client and server
BRIDGE_NAME="tritonnet"
docker network create ${BRIDGE_NAME}
EXPORT_MODEL_ARGS="${BATCH_SIZE} ${BERT_DIR} ${EXPORT_FORMAT} ${precision} 1 ${MODEL_NAME} 0 1"
# Clean up
cleanup() {
docker kill trt_server_cont
docker network rm ${BRIDGE_NAME}
}
trap cleanup EXIT
trap cleanup SIGTERM
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${EXPORT_MODEL_ARGS} ${TRITON_MODEL_OVERWRITE}
# Start Server
echo Starting server...
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )
./triton/wait_for_triton_server.sh
CMD="python triton/run_squad_client.py \
--model_name ${MODEL_NAME} \
--do_lower_case \
--vocab_file ${VOCAB_FILE} \
--output_dir ${OUT_DIR} \
--predict_file ${PREDICT_FILE} \
--batch_size ${BATCH_SIZE}"
bash scripts/docker/launch.sh "${CMD}"
bash scripts/docker/launch.sh "python ${SQUAD_DIR}/evaluate-v1.1.py ${PREDICT_FILE} ${OUT_DIR}/predictions.json"
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NV_VISIBLE_DEVICES=${1:-"0"}
DOCKER_BRIDGE=${2:-"host"}
checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
batch_size=${4:-"8"}
BERT_DIR=${5:-"/workspace/bert"}
EXPORT_FORMAT=${6:-"ts-script"}
precision=${7:-"fp16"}
triton_model_version=${8:-1}
triton_model_name=${9:-"bertQA-ts-script"}
triton_dyn_batching_delay=${10:-0}
triton_engine_count=${11:-1}
triton_model_overwrite=${12:-"False"}
PREDICT_FILE="/workspace/bert/data/squad/v1.1/dev-v1.1.json"
DEPLOYER="deployer.py"
CMD="python triton/${DEPLOYER} \
--${EXPORT_FORMAT} \
--save-dir /results/triton_models \
--triton-model-name ${triton_model_name} \
--triton-model-version ${triton_model_version} \
--triton-max-batch-size ${batch_size} \
--triton-dyn-batching-delay ${triton_dyn_batching_delay} \
--triton-engine-count ${triton_engine_count} "
CMD+="-- --checkpoint ${checkpoint} \
--config_file ${BERT_DIR}/bert_config.json \
--vocab_file /workspace/bert/vocab/vocab \
--predict_file ${PREDICT_FILE} \
--do_lower_case \
--batch_size=${batch_size} "
if [[ $precision == "fp16" ]]; then
CMD+="--fp16 "
fi
bash scripts/docker/launch.sh "${CMD}" ${NV_VISIBLE_DEVICES} ${DOCKER_BRIDGE}
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export TRITON_MODEL_OVERWRITE=True
NV_VISIBLE_DEVICES=0
bert_model=${1:-"large"}
precision=${2:-"fp32"}
init_checkpoint=${3:-"/workspace/bert/checkpoints/bert_qa.pt"}
EXPORT_FORMAT=${4:-"ts-script"}
PROFILING_DATA="triton/profiling_data_int64"
MODEL_NAME="bert_${bert_model}_${precision}_${EXPORT_FORMAT}"
BERT_DIR="/workspace/bert"
# Create common bridge for client and server
BRIDGE_NAME="tritonnet"
docker network create ${BRIDGE_NAME}
# Start Server
echo Starting server...
#bash triton/launch_triton_server.sh
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )
EXPORT_MODEL_ARGS="${BERT_DIR} ${EXPORT_FORMAT} ${precision} 1 ${MODEL_NAME}"
PERF_CLIENT_ARGS="50000 10 20"
# Restart Server
restart_server() {
docker kill trt_server_cont
#bash triton/launch_triton_server.sh
SERVER_ID=$( ./triton/launch_triton_server.sh ${BRIDGE_NAME} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES )
SERVER_IP=$( docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' ${SERVER_ID} )
}
# Clean up
cleanup() {
docker kill trt_server_cont
docker network rm ${BRIDGE_NAME}
}
trap cleanup EXIT
############## Dynamic Batching Comparison ##############
SERVER_BATCH_SIZE=8
CLIENT_BATCH_SIZE=1
TRITON_ENGINE_COUNT=1
TEST_NAME="DYN_BATCH_"
# Dynamic batching 10 ms
TRITON_DYN_BATCHING_DELAY=10
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Dynamic batching 5 ms
TRITON_DYN_BATCHING_DELAY=5
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Dynamic batching 2 ms
TRITON_DYN_BATCHING_DELAY=2
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Static Batching (i.e. Dynamic batching 0 ms)
TRITON_DYN_BATCHING_DELAY=0
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_DYN_BATCHING_DELAY} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
############## Engine Count Comparison ##############
SERVER_BATCH_SIZE=1
CLIENT_BATCH_SIZE=1
TRITON_DYN_BATCHING_DELAY=0
TEST_NAME="ENGINE_C_"
# Engine Count = 4
TRITON_ENGINE_COUNT=4
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_ENGINE_COUNT} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Engine Count = 2
TRITON_ENGINE_COUNT=2
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_ENGINE_COUNT} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# Engine Count = 1
TRITON_ENGINE_COUNT=1
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} ${PERF_CLIENT_ARGS} ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${TRITON_ENGINE_COUNT} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
############## Batch Size Comparison ##############
# BATCH=1 Generate model and perf
SERVER_BATCH_SIZE=1
CLIENT_BATCH_SIZE=1
TRITON_ENGINE_COUNT=1
TRITON_DYN_BATCHING_DELAY=0
TEST_NAME="BATCH_SIZE_"
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 64 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# BATCH=2 Generate model and perf
SERVER_BATCH_SIZE=2
CLIENT_BATCH_SIZE=2
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 32 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# BATCH=4 Generate model and perf
SERVER_BATCH_SIZE=4
CLIENT_BATCH_SIZE=4
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 16 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
# BATCH=8 Generate model and perf
SERVER_BATCH_SIZE=8
CLIENT_BATCH_SIZE=8
./triton/export_model.sh ${NV_VISIBLE_DEVICES} ${BRIDGE_NAME} ${init_checkpoint} ${SERVER_BATCH_SIZE} ${EXPORT_MODEL_ARGS} ${TRITON_DYN_BATCHING_DELAY} ${TRITON_ENGINE_COUNT} ${TRITON_MODEL_OVERWRITE}
restart_server
sleep 15
./triton/run_perf_client.sh ${MODEL_NAME} 1 ${precision} ${CLIENT_BATCH_SIZE} 1000 10 8 ${SERVER_IP} ${BRIDGE_NAME} ${TEST_NAME}${SERVER_BATCH_SIZE} ${PROFILING_DATA} ${NV_VISIBLE_DEVICES}
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DOCKER_BRIDGE=${1:-"bridge"}
NV_VISIBLE_DEVICES=${NV_VISIBLE_DEVICES:-"0"}
# Start TRITON server in detached state
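# Ports follow the Triton defaults assumed by the other scripts in this commit:
# 8000 = HTTP/REST, 8001 = gRPC (used by perf_client), 8002 = metrics.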
docker run -d --rm \
--gpus device=${NV_VISIBLE_DEVICES} \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--network=${DOCKER_BRIDGE} \
-p 8000:8000 \
-p 8001:8001 \
-p 8002:8002 \
--name trt_server_cont \
-v $PWD/results/triton_models:/models \
nvcr.io/nvidia/tritonserver:20.06-v1-py3 trtserver --model-store=/models --log-verbose=1
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MODEL_NAME=${1:-"bert"}
MODEL_VERSION=${2:-1}
precision=${3:-"fp32"}
BATCH_SIZE=${4:-1}
MAX_LATENCY=${5:-500}
MAX_CLIENT_THREADS=${6:-10}
MAX_CONCURRENCY=${7:-50}
SERVER_HOSTNAME=${8:-"localhost"}
DOCKER_BRIDGE=${9:-"host"}
RESULTS_ID=${10:-""}
PROFILING_DATA=${11:-"triton/profiling_data_int64"}
NV_VISIBLE_DEVICES=${12:-"0"}
if [[ $SERVER_HOSTNAME == *":"* ]]; then
echo "ERROR! Do not include the port when passing the Server Hostname. These scripts require that the TRITON HTTP endpoint is on Port 8000 and the gRPC endpoint is on Port 8001. Exiting..."
exit 1
fi
if [ "$SERVER_HOSTNAME" = "localhost" ]
then
if [ ! "$(docker inspect -f "{{.State.Running}}" trt_server_cont)" = "true" ] ; then
echo "Launching TRITON server"
bash triton/launch_triton_server.sh ${DOCKER_BRIDGE} --NV_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES
SERVER_LAUNCHED=true
function cleanup_server {
docker kill trt_server_cont
}
# Ensure we cleanup the server on exit
# trap "exit" INT TERM
trap cleanup_server EXIT
fi
fi
# Wait until server is up. curl on the health of the server and sleep until its ready
bash triton/wait_for_triton_server.sh $SERVER_HOSTNAME
TIMESTAMP=$(date "+%y%m%d_%H%M")
# Create model directory on host (directory /results is mounted)
bash scripts/docker/launch.sh "mkdir -p /results/perf_client/${MODEL_NAME}"
if [ ! -z "${RESULTS_ID}" ];
then
RESULTS_ID="_${RESULTS_ID}"
fi
OUTPUT_FILE_CSV="/results/perf_client/${MODEL_NAME}/results${RESULTS_ID}_${TIMESTAMP}.csv"
ARGS="\
--max-threads ${MAX_CLIENT_THREADS} \
-m ${MODEL_NAME} \
-x ${MODEL_VERSION} \
-p 3000 \
-d \
-v \
-i gRPC \
-u ${SERVER_HOSTNAME}:8001 \
-b ${BATCH_SIZE} \
-l ${MAX_LATENCY} \
-c ${MAX_CONCURRENCY} \
-f ${OUTPUT_FILE_CSV} \
--input-data ${PROFILING_DATA}"
echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')"
bash scripts/docker/launch.sh "/workspace/install/bin/perf_client $ARGS" all $DOCKER_BRIDGE
#!/usr/bin/python
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import argparse
import numpy as np
import os
import sys
from builtins import range
import collections
from tqdm import tqdm
import time
from tensorrtserver.api import *
sys.path.append('.')
from run_squad import get_answers, convert_examples_to_features, read_squad_examples
from tokenization import BertTokenizer
import json
import pickle
from functools import partial  # needed for the asynchronous result callback below
args = None
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False,
help='Enable verbose output')
parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8000',
help='Inference server URL. Default is localhost:8000.')
parser.add_argument('-i', '--protocol', type=str, required=False, default='http',
help='Protocol ("http"/"grpc") used to ' +
'communicate with inference service. Default is "http".')
parser.add_argument('-H', dest='http_headers', metavar="HTTP_HEADER",
required=False, action='append',
help='HTTP headers to add to inference server requests. ' +
'Format is -H"Header:Value".')
parser.add_argument('--synchronous', action='store_true', help="Wait for previous request to finish before sending next request.")
parser.add_argument("--model_name",
type=str,
default='bert',
help="Specify model to run")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. \
True for uncased models, False for cased models.")
parser.add_argument('--vocab_file',
type=str, default=None, required=True,
help="Vocabulary mapping/file BERT was pretrainined on")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--batch_size', default=1, type=int,
help='Maximal number of examples in a batch')
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
args = parser.parse_args()
# TRITON client setup
protocol = ProtocolType.from_str(args.protocol)
model_version = -1
infer_ctx = InferContext(args.url, protocol, args.model_name, model_version,
http_headers=args.http_headers, verbose=args.verbose)
# Preprocess input data
tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large
cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride)
eval_examples = read_squad_examples(
input_file=args.predict_file,
is_training=False,
version_2_with_negative=args.version_2_with_negative)
try:
with open(cached_features_file, "rb") as reader:
eval_features = pickle.load(reader)
except: # no usable cached features; rebuild them from the SQuAD file below
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
with open(cached_features_file, "wb") as writer:
pickle.dump(eval_features, writer)
dtype = np.int64
def batch(iterable, n=1):
l = len(iterable)
for ndx in range(0, l, n):
unique_ids = ()
example_indices = ()
input_ids_data = ()
input_mask_data = ()
segment_ids_data = ()
for i in range(0, min(n, l-ndx)):
unique_ids = unique_ids + (iterable[ndx + i].unique_id,)
example_indices = example_indices + (ndx + i,)
input_ids_data = input_ids_data + (np.array(iterable[ndx + i].input_ids, dtype=dtype),)
input_mask_data = input_mask_data + (np.array(iterable[ndx + i].input_mask, dtype=dtype),)
segment_ids_data = segment_ids_data + (np.array(iterable[ndx + i].segment_ids, dtype=dtype),)
inputs_dict = {'input__0': input_ids_data,
'input__1': segment_ids_data,
'input__2': input_mask_data}
yield inputs_dict, example_indices, unique_ids
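# Each yielded inputs_dict maps the Triton input names to a tuple with one int64 array per
# example in the batch: input__0 = token ids, input__1 = segment ids, input__2 = attention mask.
# example_indices and unique_ids let the result handlers match logits back to SQuAD features.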
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
ExampleInfo = collections.namedtuple("ExampleInfo",
["start_time", "batch_size", "example_ids", "unique_ids"])
all_results = []
time_list = []
outstanding = 0
sent_prog = tqdm(desc="Sending Requests", total=len(eval_features), file=sys.stdout, unit='sentences')
recv_prog = tqdm(desc="Processed Requests", total=len(eval_features), file=sys.stdout, unit='sentences')
if args.synchronous:
raw_results = []
def process_result_cb(example_info, ctx, request_id):
global outstanding
result = infer_ctx.get_async_run_results(request_id)
stop = time.time()
outstanding -= 1
time_list.append(stop - example_info.start_time)
batch_count = example_info.batch_size
for i in range(batch_count):
unique_id = int(example_info.unique_ids[i])
start_logits = [float(x) for x in result["output__0"][i].flat]
end_logits = [float(x) for x in result["output__1"][i].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
recv_prog.update(n=batch_count)
all_results_start = time.time()
for input_dict, example_indices, unique_ids in batch(eval_features, args.batch_size):
current_bs = len(input_dict['input__0'])
outputs_dict = {'output__0': InferContext.ResultFormat.RAW,
'output__1': InferContext.ResultFormat.RAW}
start = time.time()
example_info = ExampleInfo(start_time=start,
batch_size=current_bs,
example_ids=example_indices,
unique_ids=unique_ids
)
if not args.synchronous:
outstanding += 1
result_id = infer_ctx.async_run(partial(process_result_cb, example_info),
input_dict,
outputs_dict,
batch_size=current_bs)
else:
result = infer_ctx.run(input_dict, outputs_dict, batch_size=current_bs)
raw_results.append((example_info, result))
sent_prog.update(n=current_bs)
# Make sure that all sent requests have been processed
while outstanding > 0:
pass
all_results_end = time.time()
all_results_total = (all_results_end - all_results_start) * 1000.0
num_batches = (len(eval_features) + args.batch_size - 1) // args.batch_size
if args.synchronous:
for result in raw_results:
example_info, batch = result
for i in range(example_info.batch_size):
unique_id = int(example_info.unique_ids[i])
start_logits = [float(x) for x in batch["output__0"][i].flat]
end_logits = [float(x) for x in batch["output__1"][i].flat]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
recv_prog.update(n=example_info.batch_size)
print("-----------------------------")
print("Individual Time Runs")
print("Total Time: {} ms".format(all_results_total))
print("-----------------------------")
print("-----------------------------")
print("Total Inference Time = %0.2f for"
"Sentences processed = %d" % (sum(time_list), len(eval_features)))
print("Throughput Average (sentences/sec) = %0.2f" % (len(eval_features) / all_results_total * 1000.0))
print("Throughput Average (batches/sec) = %0.2f" % (num_batches / all_results_total * 1000.0))
print("-----------------------------")
if not args.synchronous:
time_list.sort()
avg = np.mean(time_list)
cf_95 = max(time_list[:int(len(time_list) * 0.95)])
cf_99 = max(time_list[:int(len(time_list) * 0.99)])
cf_100 = max(time_list[:int(len(time_list) * 1)])
print("-----------------------------")
print("Summary Statistics")
print("Batch size =", args.batch_size)
print("Sequence Length =", args.max_seq_length)
print("Latency Confidence Level 95 (ms) =", cf_95 * 1000)
print("Latency Confidence Level 99 (ms) =", cf_99 * 1000)
print("Latency Confidence Level 100 (ms) =", cf_100 * 1000)
print("Latency Average (ms) =", avg * 1000)
print("-----------------------------")
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args)
with open(output_prediction_file, "w") as f:
f.write(json.dumps(answers, indent=4) + "\n")
with open(output_nbest_file, "w") as f:
f.write(json.dumps(nbest_answers, indent=4) + "\n")
#!/bin/bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
SERVER_URI=${1:-"localhost"}
echo "Waiting for TRITON Server to be ready at http://$SERVER_URI:8000..."
live_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/live"
ready_command="curl -i -m 1 -L -s -o /dev/null -w %{http_code} http://$SERVER_URI:8000/api/health/ready"
current_status=$($live_command)
echo $current_status
# First check the current status. If that passes, check the json. If either fail, loop
while [[ ${current_status} != "200" ]] || [[ $($ready_command) != "200" ]]; do
printf "."
sleep 1
current_status=$($live_command)
done
echo "TRITON Server is ready!"
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.distributed as dist
from pathlib import Path
def get_rank():
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
return dist.get_rank()
def get_world_size():
if not dist.is_available():
return 1
if not dist.is_initialized():
return 1
return dist.get_world_size()
def is_main_process():
return get_rank() == 0
def barrier():
if dist.is_available() and dist.is_initialized():
dist.barrier()
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s
def mkdir(path):
Path(path).mkdir(parents=True, exist_ok=True)
def mkdir_by_main_process(path):
if is_main_process():
mkdir(path)
barrier()
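# Minimal usage sketch (assumes torch.distributed has already been initialized by the launcher
# when running multi-GPU; the output directory below is only an example):
#
#   from utils import is_main_process, mkdir_by_main_process
#   mkdir_by_main_process("/results/output")   # rank 0 creates it, all ranks wait at the barrier
#   if is_main_process():
#       print("only rank 0 logs this")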