import time

import numpy as np
from transformers import AutoTokenizer
from onnxruntime import InferenceSession


def main():
    tokenizer = AutoTokenizer.from_pretrained('./')

    # context = 'ONNX is an open format to represent models. The benefits of using ONNX include interoperability of frameworks and hardware optimization.'
    # question = 'What are advantages of ONNX?'
    # Chinese question-answering test pairs (context / question) for the Chinese BERT QA model.
    contexts = ['今天天气晴', '中国历史有5000年', 'ROCM是AMD的一个软件平台,用来加速GPU计算']
    questions = ['今天天气怎么样?', '中国历史有多少年?', 'ROCM用来干什么?']

    providers = [('ROCMExecutionProvider', {'device_id': '4'}), 'CPUExecutionProvider']
    session = InferenceSession("./model.onnx", providers=providers)
    session_fp16 = InferenceSession("./model_fp16.onnx", providers=providers)

    # Collect the names and shapes of the model's original inputs.
    input_names = []
    input_shapes = []
    for inp in session.get_inputs():
        input_names.append(inp.name)
        input_shapes.append(inp.shape)
    print("input_names:", input_names)
    print("input_shapes:", input_shapes)

    # Collect the names of the model's outputs.
    output_names = [out.name for out in session.get_outputs()]
    print("output_names:", output_names)

    total_iou = 0.0
    n = len(contexts)
    for i in range(n):
        inputs = tokenizer(questions[i], contexts[i], padding=True, truncation=False, return_tensors='np')
        print("inputs:", tokenizer.decode(inputs.input_ids[0]))

        # Zero-pad each input to the model's fixed sequence length (input_shapes[0][1]).
        seq_len = input_shapes[0][1]
        input_ids_zeros = np.zeros((1, seq_len), np.int64)
        input_mask_zeros = np.zeros((1, seq_len), np.int64)
        segment_ids_zeros = np.zeros((1, seq_len), np.int64)
        for j in range(len(inputs.input_ids[0])):
            input_ids_zeros[0][j] = inputs.input_ids[0][j]
            input_mask_zeros[0][j] = inputs.attention_mask[0][j]
            segment_ids_zeros[0][j] = inputs.token_type_ids[0][j]

        onnx_input = {
            input_names[0]: input_ids_zeros,
            input_names[1]: input_mask_zeros,
            input_names[2]: segment_ids_zeros,
        }

        # FP32 inference: run 10 times to measure latency, keep the last outputs.
        for k in range(10):
            t1 = time.perf_counter()
            outputs = session.run(output_names=None, input_feed=onnx_input)
            t2 = time.perf_counter()
            print("fp32 infer time:", k, t2 - t1)
        # print(outputs)
        answer_start_index = outputs[0].argmax()
        answer_end_index = outputs[1].argmax()
        predict_answer_tokens = inputs.input_ids[0, answer_start_index:answer_end_index + 1]
        print("fp32 results:", tokenizer.decode(predict_answer_tokens))

        # FP16 inference: same inputs, same timing loop.
        for k in range(10):
            t1 = time.perf_counter()
            outputs_fp16 = session_fp16.run(output_names=None, input_feed=onnx_input)
            t2 = time.perf_counter()
            print("fp16 infer time:", k, t2 - t1)
        # print(outputs_fp16)
        answer_start_index_fp16 = outputs_fp16[0].argmax()
        answer_end_index_fp16 = outputs_fp16[1].argmax()
        predict_answer_tokens_fp16 = inputs.input_ids[0, answer_start_index_fp16:answer_end_index_fp16 + 1]
        print("fp16 results:", tokenizer.decode(predict_answer_tokens_fp16))

        # IoU of the fp32 and fp16 answer spans (intersection over union of token indices);
        # note it can be negative when the two spans do not overlap.
        answer_start_index_max = max(answer_start_index, answer_start_index_fp16)
        answer_end_index_min = min(answer_end_index, answer_end_index_fp16)
        answer_start_index_min = min(answer_start_index, answer_start_index_fp16)
        answer_end_index_max = max(answer_end_index, answer_end_index_fp16)
        iou = (answer_end_index_min - answer_start_index_max + 1) / (answer_end_index_max - answer_start_index_min + 1)
        print("result iou of fp16/fp32:", iou)
        total_iou += iou

    print("average iou:", total_iou / n)


if __name__ == "__main__":
    main()