"tests/vscode:/vscode.git/clone" did not exist on "916d375ba3d62e018231633ca74e33ce128085c3"
Commit e206b112 authored by sugon_cxj's avatar sugon_cxj
Browse files

first commit

parent 22066d76
Pipeline #472 canceled with stages
# bert_large_squad_onnx
## Model Introduction
A bert-large model fine-tuned on SQuAD.
## Model Structure
A Transformer-based encoder with the standard BERT-large dimensions: 24 layers, hidden size 1024, 16 attention heads.
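These dimensions can also be read from the config.json bundled in this repository (a minimal sketch, assuming config.json sits in the current directory):
```
python3 -c "import json; c = json.load(open('./config.json')); print(c['num_hidden_layers'], c['hidden_size'], c['num_attention_heads'])"
```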
## Inference
### Environment Setup
The docker image for inference can be pulled from [SourceFind (光源)](https://www.sourcefind.cn/#/service-details), and the onnxruntime package can be downloaded from the [HPC developer community (光合开发者社区)](https://cancon.hpccube.com:65024/4/main/). The recommended image for bert_large_squad_onnx is:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:ort1.14.0_migraphx3.0.0-dtk22.10.1
```
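A container can then be started from that image. The flags below are a typical setup for ROCm/DCU containers; the device mappings and workspace mount are assumptions that may need adjusting for your machine:
```
docker run -it --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined \
    -v $PWD:/workspace image.sourcefind.cn:5000/dcu/admin/base/custom:ort1.14.0_migraphx3.0.0-dtk22.10.1 /bin/bash
```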
Download model.onnx from [huggingface](https://huggingface.co/ctuning/mlperf-inference-bert-onnx-fp32-squad-v1.1) into the current directory.
Run the FP16 conversion:
```
python3 fp16-convert.py
```
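A quick sanity check that the converted model is well-formed can be done with the onnx package (a minimal sketch; assumes the onnx package is installed in the image):
```
python3 -c "import onnx; onnx.checker.check_model(onnx.load('./model_fp16.onnx'))"
```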
### Run Inference
```
python3 main.py
```
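If the DCU is not visible to onnxruntime, the session silently falls back to CPU, which skews the timings below. The active providers can be checked with the standard `get_providers()` call (a minimal sketch; the model path and `device_id` mirror main.py):
```
python3 -c "
from onnxruntime import InferenceSession
s = InferenceSession('./model.onnx', providers=[('ROCMExecutionProvider', {'device_id': '4'}), 'CPUExecutionProvider'])
print(s.get_providers())  # ROCMExecutionProvider should be listed first
"
```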
## Performance Data
Per-inference latency in seconds, as printed by main.py from time.perf_counter.

FP32:

| loop | time(s) |
| :------: | :------: |
| 1 | 0.09298863005824387 |
| 2 | 0.04267867305316031 |
| 3 | 0.04294574190862477 |
| 4 | 0.042622152948752046 |
| 5 | 0.042897791834548116 |
| 6 | 0.04309680196456611 |
| 7 | 0.04240077408030629 |
| 8 | 0.042515473905950785 |
| 9 | 0.0424974428024143 |
| 10 | 0.04259936395101249 |

FP16:

| loop | time(s) |
| :------: | :------: |
| 1 | 0.059390615904703736 |
| 2 | 0.04876187210902572 |
| 3 | 0.04870052193291485 |
| 4 | 0.04873379203490913 |
| 5 | 0.04842417314648628 |
| 6 | 0.04876326210796833 |
| 7 | 0.04846481396816671 |
| 8 | 0.04872900294139981 |
| 9 | 0.048555332934483886 |
| 10 | 0.048343464033678174 |
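
Loop 1 includes warm-up costs, so steady-state latency is better summarized by averaging loops 2–10. A minimal sketch using the FP32 column above (the same calculation applies to FP16):
```
fp32_times = [0.09298863005824387, 0.04267867305316031, 0.04294574190862477,
              0.042622152948752046, 0.042897791834548116, 0.04309680196456611,
              0.04240077408030629, 0.042515473905950785, 0.0424974428024143,
              0.04259936395101249]
print(sum(fp32_times[1:]) / 9)  # ~0.0427 s per run after warm-up
```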
## Source Repository and Issue Feedback
https://developer.hpccube.com/codes/modelzoo/bert_large_squad_onnx
## References
https://github.com/google-research/bert
{
"_name_or_path": "yechen/bert-large-chinese",
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"directionality": "bidi",
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.12.5",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 21128
}
import onnxmltools
from onnxmltools.utils.float16_converter import convert_float_to_float16

input_onnx_model = "./model.onnx"
output_onnx_model = "./model_fp16.onnx"
print(input_onnx_model)
print(output_onnx_model)

# Load the FP32 model, convert its float tensors and initializers to float16, and save it
onnx_model = onnxmltools.utils.load_model(input_onnx_model)
onnx_model = convert_float_to_float16(onnx_model)
onnxmltools.utils.save_model(onnx_model, output_onnx_model)
import numpy as np
import time
from transformers import AutoTokenizer
from onnxruntime import InferenceSession


def main():
    tokenizer = AutoTokenizer.from_pretrained('./')
    # Alternative English example:
    # context = 'ONNX is an open format to represent models. The benefits of using ONNX include interoperability of frameworks and hardware optimization.'
    # question = 'What are advantages of ONNX?'
    # Alternative Chinese examples:
    # context = '今天天气晴'          # "The weather is sunny today"
    # question = '今天天气怎么样?'    # "How is the weather today?"
    # context = '中国历史有5000年'    # "China has 5000 years of history"
    # question = '中国历史有多少年?'  # "How many years of history does China have?"
    context = 'ROCM是AMD的一个软件平台,用来加速GPU计算'  # "ROCm is an AMD software platform used to accelerate GPU compute"
    question = 'ROCM用来干什么?'  # "What is ROCm used for?"
    session = InferenceSession("./model.onnx",
                               providers=[('ROCMExecutionProvider', {'device_id': '4'}), 'CPUExecutionProvider'])
    session_fp16 = InferenceSession("./model_fp16.onnx",
                                    providers=[('ROCMExecutionProvider', {'device_id': '4'}), 'CPUExecutionProvider'])
    # Collect the names and shapes of the model's inputs
    input_names = []
    input_shapes = []
    for inp in session.get_inputs():
        input_names.append(inp.name)
        input_shapes.append(inp.shape)
    print("input_names:", input_names)
    print("input_shapes:", input_shapes)
    # Collect the names of the model's outputs
    output_names = [out.name for out in session.get_outputs()]
    print("output_names:", output_names)
    inputs = tokenizer(question, context, padding=True, truncation=False, return_tensors='np')
    print("inputs:", tokenizer.decode(inputs.input_ids[0]))
    # Zero-pad the tokenized inputs to the fixed sequence length (384) the model expects
    input_ids_zeros = np.zeros((1, 384), np.int64)
    input_mask_zeros = np.zeros((1, 384), np.int64)
    segment_ids_zeros = np.zeros((1, 384), np.int64)
    seq_len = len(inputs.input_ids[0])
    input_ids_zeros[0, :seq_len] = inputs.input_ids[0]
    input_mask_zeros[0, :seq_len] = inputs.attention_mask[0]
    segment_ids_zeros[0, :seq_len] = inputs.token_type_ids[0]
    onnx_input = {input_names[0]: input_ids_zeros,
                  input_names[1]: input_mask_zeros,
                  input_names[2]: segment_ids_zeros}
    # Time 10 FP32 runs; the first iteration includes warm-up overhead
    for i in range(10):
        t1 = time.perf_counter()
        outputs = session.run(output_names=None, input_feed=onnx_input)
        t2 = time.perf_counter()
        print("fp32:", i, t2 - t1)
    # Decode the answer span from the start/end logits
    answer_start_index = outputs[0].argmax()
    answer_end_index = outputs[1].argmax()
    predict_answer_tokens = inputs.input_ids[0, answer_start_index:answer_end_index + 1]
    print("results fp32:", tokenizer.decode(predict_answer_tokens))
    # Time 10 FP16 runs on the converted model
    for i in range(10):
        t1 = time.perf_counter()
        outputs_fp16 = session_fp16.run(output_names=None, input_feed=onnx_input)
        t2 = time.perf_counter()
        print("fp16:", i, t2 - t1)
    answer_start_index_fp16 = outputs_fp16[0].argmax()
    answer_end_index_fp16 = outputs_fp16[1].argmax()
    predict_answer_tokens_fp16 = inputs.input_ids[0, answer_start_index_fp16:answer_end_index_fp16 + 1]
    print("results fp16:", tokenizer.decode(predict_answer_tokens_fp16))


if __name__ == "__main__":
    main()
# Model name
modelName=bert_large_squad_onnx
# Model description
modelDescription=bert_large_squad_onnx is an ONNX model for the SQuAD downstream task based on bert-large
# Application scenario
appScenario=inference,NLP
# Framework type
frameType=onnxruntime