#!/usr/bin/python # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse import numpy as np import os import sys from builtins import range import collections from tqdm import tqdm import time from tensorrtserver.api import * sys.path.append('.') from run_squad import get_answers, convert_examples_to_features, read_squad_examples from tokenization import BertTokenizer import json import pickle args = None if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-v', '--verbose', action="store_true", required=False, default=False, help='Enable verbose output') parser.add_argument('-u', '--url', type=str, required=False, default='localhost:8000', help='Inference server URL. Default is localhost:8000.') parser.add_argument('-i', '--protocol', type=str, required=False, default='http', help='Protocol ("http"/"grpc") used to ' + 'communicate with inference service. Default is "http".') parser.add_argument('-H', dest='http_headers', metavar="HTTP_HEADER", required=False, action='append', help='HTTP headers to add to inference server requests. ' + 'Format is -H"Header:Value".') parser.add_argument('--synchronous', action='store_true', help="Wait for previous request to finish before sending next request.") parser.add_argument("--model_name", type=str, default='bert', help="Specify model to run") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. \ True for uncased models, False for cased models.") parser.add_argument('--vocab_file', type=str, default=None, required=True, help="Vocabulary mapping/file BERT was pretrainined on") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") parser.add_argument('--batch_size', default=1, type=int, help='Maximal number of examples in a batch') parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") args = parser.parse_args() # TRITON client setup protocol = ProtocolType.from_str(args.protocol) model_version = -1 infer_ctx = InferContext(args.url, protocol, args.model_name, model_version, http_headers=args.http_headers, verbose=args.verbose) # Preprocess input data tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large cached_features_file = args.predict_file + '_{}_{}.bin'.format(args.max_seq_length, args.doc_stride) eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) try: with open(cached_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) with open(cached_features_file, "wb") as writer: pickle.dump(eval_features, writer) dtype = np.int64 def batch(iterable, n=1): l = len(iterable) for ndx in range(0, l, n): unique_ids = () example_indices = () input_ids_data = () input_mask_data = () segment_ids_data = () for i in range(0, min(n, l-ndx)): unique_ids = unique_ids + (iterable[ndx + i].unique_id,) example_indices = example_indices + (ndx + i,) input_ids_data = input_ids_data + (np.array(iterable[ndx + i].input_ids, dtype=dtype),) input_mask_data = input_mask_data + (np.array(iterable[ndx + i].input_mask, dtype=dtype),) segment_ids_data = segment_ids_data + (np.array(iterable[ndx + i].segment_ids, dtype=dtype),) inputs_dict = {'input__0': input_ids_data, 'input__1': segment_ids_data, 'input__2': input_mask_data} yield inputs_dict, example_indices, unique_ids RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) ExampleInfo = collections.namedtuple("ExampleInfo", ["start_time", "batch_size", "example_ids", "unique_ids"]) all_results = [] time_list = [] outstanding = 0 sent_prog = tqdm(desc="Sending Requests", total=len(eval_features), file=sys.stdout, unit='sentences') recv_prog = tqdm(desc="Processed Requests", total=len(eval_features), file=sys.stdout, unit='sentences') if args.synchronous: raw_results = [] def process_result_cb(example_info, ctx, request_id): global outstanding result = infer_ctx.get_async_run_results(request_id) stop = time.time() outstanding -= 1 time_list.append(stop - example_info.start_time) batch_count = example_info.batch_size for i in range(batch_count): unique_id = int(example_info.unique_ids[i]) start_logits = [float(x) for x in result["output__0"][i].flat] end_logits = [float(x) for x in result["output__1"][i].flat] all_results.append( RawResult( unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) recv_prog.update(n=batch_count) all_results_start = time.time() for input_dict, example_indices, unique_ids in batch(eval_features, args.batch_size): current_bs = len(input_dict['input__0']) outputs_dict = {'output__0': InferContext.ResultFormat.RAW, 'output__1': InferContext.ResultFormat.RAW} start = time.time() example_info = ExampleInfo(start_time=start, batch_size=current_bs, example_ids=example_indices, unique_ids=unique_ids ) if not args.synchronous: outstanding += 1 result_id = infer_ctx.async_run(partial(process_result_cb, example_info), input_dict, outputs_dict, batch_size=current_bs) else: result = infer_ctx.run(input_dict, outputs_dict, batch_size=current_bs) raw_results.append((example_info, result)) sent_prog.update(n=current_bs) # Make sure that all sent requests have been processed while outstanding > 0: pass all_results_end = time.time() all_results_total = (all_results_end - all_results_start) * 1000.0 num_batches = (len(eval_features) + args.batch_size - 1) // args.batch_size if args.synchronous: for result in raw_results: example_info, batch = result for i in range(example_info.batch_size): unique_id = int(example_info.unique_ids[i]) start_logits = [float(x) for x in batch["output__0"][i].flat] end_logits = [float(x) for x in batch["output__1"][i].flat] all_results.append( RawResult( unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) recv_prog.update(n=example_info.batch_size) print("-----------------------------") print("Individual Time Runs") print("Total Time: {} ms".format(all_results_total)) print("-----------------------------") print("-----------------------------") print("Total Inference Time = %0.2f for" "Sentences processed = %d" % (sum(time_list), len(eval_features))) print("Throughput Average (sentences/sec) = %0.2f" % (len(eval_features) / all_results_total * 1000.0)) print("Throughput Average (batches/sec) = %0.2f" % (num_batches / all_results_total * 1000.0)) print("-----------------------------") if not args.synchronous: time_list.sort() avg = np.mean(time_list) cf_95 = max(time_list[:int(len(time_list) * 0.95)]) cf_99 = max(time_list[:int(len(time_list) * 0.99)]) cf_100 = max(time_list[:int(len(time_list) * 1)]) print("-----------------------------") print("Summary Statistics") print("Batch size =", args.batch_size) print("Sequence Length =", args.max_seq_length) print("Latency Confidence Level 95 (ms) =", cf_95 * 1000) print("Latency Confidence Level 99 (ms) =", cf_99 * 1000) print("Latency Confidence Level 100 (ms) =", cf_100 * 1000) print("Latency Average (ms) =", avg * 1000) print("-----------------------------") output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") answers, nbest_answers = get_answers(eval_examples, eval_features, all_results, args) with open(output_prediction_file, "w") as f: f.write(json.dumps(answers, indent=4) + "\n") with open(output_nbest_file, "w") as f: f.write(json.dumps(nbest_answers, indent=4) + "\n")