Commit 24b257f1 authored by sunzhq2

init

parent 920b3c0f
{
"model": "albert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/albert.onnx",
"model_path": "general_perf/download/moffett/converted_models/albert-mf-int8/albert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/albert-base-v2_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "bert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/bert-base.onnx",
"model_path": "general_perf/download/moffett/converted_models/bert-mf-int8/bert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/bert-base-uncased_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "conformer-encoder-onnx-fp32",
"onnx_path": "general_perf/download/moffett/models/conformer.onnx",
"model_path": "general_perf/download/moffett/converted_models/conformer-mf-int8/conformer-mf-int8.zip",
"calibration_dir": " ",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "none",
"framework": "sparsert"
}
{
"model": "resnet50-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/resnet50.onnx",
"model_path": "general_perf/download/moffett/converted_models/resnet50-mf-int8/resnet50-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/imgnet_calibrate_data",
"transform_file": "general_perf/download/moffett/compiler_wrapper/resnet50-mf-int8/mxnet_imagenet_trans_224.json",
"batch_size": 4,
"verify": false,
"model_precision": "INT8",
"dataset_name": "open_imagenet",
"framework": "sparsert"
}
{
"model": "roberta-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/roberta.onnx",
"model_path": "general_perf/download/moffett/converted_models/roberta-mf-int8/roberta-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/roberta-base_squad.npy",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
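The five entries above share one schema: model, onnx_path, model_path, calibration_dir, transform_file, batch_size, verify, model_precision, dataset_name, and framework. Below is a minimal sketch of loading and sanity-checking such a config before handing it to the compiler; the `load_moffett_config` helper and its key checks are illustrative assumptions, not part of this commit.

```python
# Sketch only: illustrative loader for the per-model JSON configs above.
import json

REQUIRED_KEYS = {
    "model", "onnx_path", "model_path", "calibration_dir",
    "transform_file", "batch_size", "verify", "model_precision",
    "dataset_name", "framework",
}

def load_moffett_config(path):
    """Load one per-model JSON config and check that the expected keys are present."""
    with open(path, "r") as f:
        cfg = json.load(f)
    missing = REQUIRED_KEYS - cfg.keys()
    if missing:
        raise ValueError(f"{path} is missing keys: {sorted(missing)}")
    if cfg["framework"] != "sparsert":
        raise ValueError(f"unexpected framework: {cfg['framework']}")
    return cfg

# Example: cfg = load_moffett_config("albert-torch-fp32.json"); print(cfg["batch_size"])
```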
import os
import json
import logging
import torch._tensor
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import migraphx
import tensorflow as tf
import torch
import onnxruntime
import time
import numpy as np
import onnx
from onnx import shape_inference
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendDCU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"INT8": np.int8,
"UINT8": np.uint8,
"FLOAT32": np.float32,
"FLOAT16": np.float16,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendDCU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendDCU, self).__init__()
self.hardware_type = 'DCU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Tensorflow":
entry_rt = self.model_runtimes[0].signatures['serving_default']
all_sn_inputs = entry_rt.structured_input_signature
def get_real_feeds(feeds, sn_inputs):
sn_inputs = tf.nest.flatten(sn_inputs, True)
real_feeds = {}
itr = 0
for _, val in feeds.items():
real_feeds[sn_inputs[itr].name] = tf.constant(val)
itr += 1
return real_feeds
real_feeds = get_real_feeds(feeds, all_sn_inputs)
start_time = time.time()
for model_runtime in self.model_runtimes:
with tf.device('GPU'):
_results = model_runtime.signatures['serving_default'](
**real_feeds)
end_time = time.time()
use_time = end_time - start_time
results = {}
for key, val in _results.items():
results[key] = val.numpy()
assert len(results) != 0
elif self.framework == "Pytorch":
input_tensors = []
new_input_type = self.input_type.split(',')
i = 0
for key, _ in feeds.items():
input_tensors.append(
torch.tensor(feeds[key],
dtype=pt_dtype_map[new_input_type[i]]).to(
self.device))
i += 1
start_time = time.time()
if self.configs['compile_precision'] == "FP16" and self.configs['model'].find("bert") != -1:
with torch.no_grad(), torch.cuda.amp.autocast():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
else:
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
end_time = time.time()
use_time = end_time - start_time
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.cpu().detach().numpy()
elif isinstance(results, tuple):
dic = {}
for i, key in enumerate(self.outputs):
dic[key] = results[i].cpu().detach().numpy()
results = dic
else:
results = {self.outputs[0]: results.cpu().numpy()}
elif self.framework == "Migraphx":
for model_runtime in self.model_runtimes:
modelData = self.AllocateOutputMemory(model_runtime)
for key, _ in feeds.items():
feeds[key] = np.array(feeds[key])
modelData[key] = migraphx.to_gpu(migraphx.argument(feeds[key]))
start_time = time.time()
results_migraphx = model_runtime.run(modelData)
end_time = time.time()
use_time = end_time - start_time
results = []
for i in range(len(results_migraphx)):
result = np.array(results_migraphx[i])
results.append(result)
else:
for model_runtime in self.model_runtimes:
start_time = time.time()
results = model_runtime.run(None, feeds)
end_time = time.time()
use_time = end_time - start_time
return use_time, results
# return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
time_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
for _ in range(30):
self.predict(test_data)
for _ in range(iterations):
start_time = time.time()
use_time,_ = self.predict(test_data)
end_time = time.time()
times_range.append(use_time)
time_range.append(batch_size / use_time)
# times_range.append(end_time - start_time)
times_range.sort()
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
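# avg_latency is in milliseconds, so QPS = 1000 * batch_size / avg_latency;
# e.g. batch_size = 12 with a 6 ms average latency gives int(1000.0 * 12 / 6) = 2000 QPS.
# tail_latency is the per-iteration time at the 99th-percentile index of the sorted list.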
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
for i, segment in enumerate(self.configs['segments']):
# there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Tensorflow":
'''
Determine the required model precision and apply the corresponding conversion.
'''
if self.configs['compile_precision'] == "FP16":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
for var in model.variables:
var.assign(tf.cast(var,tf.float16))
if self.configs['compile_precision'] == "INT8":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
for var in model.variables:
var.assign(tf.cast(var,tf.int8))
if self.configs['compile_precision'] == "FP32":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
elif self.framework == "Pytorch":
self.device = "cuda"
'''
Check whether the model is a BERT variant; if so, the torch.jit fuser must be disabled.
'''
if self.configs['model'].find("bert") != -1:
torch._C._jit_set_texpr_fuser_enabled(False)
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj'],
torch.device('cuda'))
if self.configs['compile_precision'] == "FP16":
if self.configs['model'].find("bert") != -1:
scaler = torch.cuda.amp.GradScaler()
model = model.half()
model.eval()
elif self.framework == "Migraphx":
self.device = "cuda"
if self.configs['model'] == 'bert-migraphx-fp16':
model = migraphx.load(segment['compiled_model'][0]['compiled_obj'] + f'-{self.batch_size}.mrx')
else:
model = migraphx.parse_onnx(segment['compiled_model'][0]['compiled_obj'] + f'-{self.batch_size}.onnx')
if self.configs['compile_precision'] == "INT8":
print("=======================INT8====================")
dic = dict()
fake_data = self._get_fake_samples(batch_size, self.configs['segments'][0]['input_tensor_map'], self.configs['input_type'])
for key,_ in fake_data.items():
dic[key] = migraphx.argument(fake_data[key])
calibration = [dic]
migraphx.quantize_int8(model, migraphx.get_target("gpu"), calibration)
model.compile(migraphx.get_target("gpu"),offload_copy=False,device_id=0)
else:
enable_tag = 'false'
if self.configs['compile_precision'] == 'FP16':
enable_tag = 'true'
if self.configs['model'].find("resnet50") != -1:
if self.configs['compile_precision'] == 'INT8':
providers = ['ROCMExecutionProvider']
else:
providers = ['MIGraphXExecutionProvider']
# provider_options=[{'device_id': '0','migraphx_fp16_enable':enable_tag,'dynamic_model':'true','migraphx_profile_max_shapes':'input_1.1:256x3x224x224'}]
provider_options=[{'device_id': '0'}]
else:
providers=['ROCMExecutionProvider']
# provider_options=None
provider_options=[{'device_id': '0'}]
# model = onnxruntime.InferenceSession(
# segment['compiled_model'][0]['compiled_obj'],
# providers=providers,provider_options=provider_options)
model = onnxruntime.InferenceSession(
segment['compiled_model'][0]['compiled_obj'] + f'-{self.batch_size}.onnx',
providers=providers,provider_options=provider_options)
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
new_input_type = self.input_type.split(',')
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
def AllocateOutputMemory(self, model):
outputData={}
for key in model.get_outputs().keys():
outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
return outputData
def GetMIGraphXType(self, type):
typeMap = {
'double_type': np.float64,
'float_type': np.float32,
'half_type': np.half,
'int64_type': np.int64,
'uint64_type': np.uint64,
'int32_type': np.int32,
'uint32_type': np.uint32,
'int16_type': np.int16,
'uint16_type': np.uint16,
'int8_type': np.int8,
'uint8_type': np.uint8,
'bool_type': bool
}
return typeMap[type]
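A hedged sketch of how this runtime backend could be exercised end to end; in ByteMLPerf the configs and workload dicts come from the compile step, so every value below is an illustrative placeholder rather than something taken from this commit.

```python
# Illustrative driver only: the harness normally fills configs/workload from the compile backend.
backend = RuntimeBackendDCU()
backend.configs = {
    "model": "resnet50-torch-fp32",
    "framework": "Pytorch",
    "compile_precision": "FP32",
    "input_type": "FLOAT32",
    "segments": [{
        "input_tensor_map": {"input": [1, 3, 224, 224]},
        "output_tensor_map": "output",
        "compiled_model": [{"compiled_obj": "compiled_models/resnet50.pt"}],  # placeholder path
    }],
}
backend.workload = {"iterations": 100}
backend.load(batch_size=4)                    # picks the Pytorch branch, loads the TorchScript model
report = backend.benchmark(dataloader=None)   # {'BS': ..., 'QPS': ..., 'AVG Latency': ..., 'P99 Latency': ...}
```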
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import torch
import onnxruntime
import time
import numpy as np
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendDCU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"INT8": np.int8,
"UINT8": np.uint8,
"FLOAT32": np.float32,
"FLOAT16": np.float16,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendDCU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendDCU, self).__init__()
self.hardware_type = 'DCU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Tensorflow":
entry_rt = self.model_runtimes[0].signatures['serving_default']
all_sn_inputs = entry_rt.structured_input_signature
def get_real_feeds(feeds, sn_inputs):
sn_inputs = tf.nest.flatten(sn_inputs, True)
real_feeds = {}
itr = 0
for _, val in feeds.items():
real_feeds[sn_inputs[itr].name] = tf.constant(val)
itr += 1
return real_feeds
real_feeds = get_real_feeds(feeds, all_sn_inputs)
for model_runtime in self.model_runtimes:
with tf.device('GPU'):
_results = model_runtime.signatures['serving_default'](
**real_feeds)
results = {}
for key, val in _results.items():
results[key] = val.numpy()
assert len(results) != 0
elif self.framework == "Pytorch":
input_tensors = []
new_input_type = self.input_type.split(',')
i = 0
for key, _ in feeds.items():
input_tensors.append(
torch.tensor(feeds[key],
dtype=pt_dtype_map[new_input_type[i]]).to(
self.device))
i += 1
if self.configs["model"] == "bert-torch-fp16":
with torch.cuda.amp.autocast():
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
else:
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.cpu().detach().numpy()
elif isinstance(results, tuple):
dic = {}
for i, key in enumerate(self.outputs):
dic[key] = results[i].cpu().detach().numpy()
results = dic
else:
results = {self.outputs[0]: results.cpu().numpy()}
else:
for model_runtime in self.model_runtimes:
if self.configs["model"] == "resnet50-onnxruntime-fp16":
feeds["input_1.1"] = feeds["input_1.1"].astype("float16")
results = model_runtime.run(None, feeds)
return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
for _ in range(30):
self.predict(test_data)
for _ in range(iterations):
start_time = time.time()
self.predict(test_data)
end_time = time.time()
times_range.append(end_time - start_time)
times_range.sort()
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
for i, segment in enumerate(self.configs['segments']):
# there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Tensorflow":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
if self.configs['compile_precision'] == "FP16":
# cast all variables to float16
for var in model.variables:
var.assign(tf.cast(var,tf.float16))
elif self.framework == "Pytorch":
self.device = "cuda"
if self.configs["model"].split("-")[0] == "bert" or self.configs["model"].split("-")[0] == "roberta":
# torch.jit.fuser('off')
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
# torch._C._jit_set_texpr_fuser_enabled(False)
# torch._C._jit_set_nvfuser_enabled(False)
# https://github.com/pytorch/pytorch/issues/62962
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj'],
torch.device('cuda'))
if self.configs['compile_precision'] == "FP16":
model = model.half()
model.eval()
else:
# import pdb
# pdb.set_trace()
providers = [
('ROCMExecutionProvider', {
'device_id': 0,
'arena_extend_strategy': 'kNextPowerOfTwo',
# 'cudnn_conv_algo_search': 'EXHAUSTIVE',
'do_copy_in_default_stream': True,
}),
]
# enable FP16
options = onnxruntime.SessionOptions()
# options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
# options.intra_op_num_threads = 1
# options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
# options.enable_cuda_graph = True # if your hardware supports CUDA Graph
# options.add_session_config_entry("session.set_denormal_as_zero", "1")
if self.configs['compile_precision'] == "FP16":
options.add_session_config_entry("session.enable_fp16", "1") # enable FP16
model = onnxruntime.InferenceSession(
segment['compiled_model'][0]['compiled_obj'],
providers=providers,
sess_options=options)
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
new_input_type = self.input_type.split(',')
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
<div align="center">
<img src="habana-white_intel_logo.png">
</div>
<!-- @import "[TOC]" {cmd="toc" depthFrom=1 depthTo=6 orderedList=false} -->
<!-- code_chunk_output -->
- [Habana](#habana)
- [Product Specs](#product-specs)
- [Models supported](#models-supported)
- [How to run](#how-to-run)
- [1. Create docker container](#1-create-docker-container)
- [2. Environment initialization](#2-environment-initialization)
- [3. Device basic information verification](#3-device-basic-information-verification)
- [4. Run byte-mlperf task](#4-run-byte-mlperf-task)
<!-- /code_chunk_output -->
# Habana
As enterprises and organizations look to seize the growing advantages of AI, the time has never been better for AI compute that's faster yet efficient: efficient on cost, power, and your time and resources. That's why you'll want to give Habana Gaudi processors a try. The Gaudi acceleration platform was conceived and architected to address the training and inference demands of large-scale AI, providing enterprises and organizations with high-performance, high-efficiency deep learning compute.
## Product Specs
- Gaudi
With Habana's first-generation Gaudi deep learning processor, customers benefit from the most cost-effective, high-performance training and inference alternative to comparable GPUs. This is the deep learning architecture that enables AWS to deliver up to 40% better price/performance for training with its Gaudi-based DL1 instances, as compared to comparable Nvidia GPU-based instances. Gaudi's efficient architecture also enables Supermicro to offer customers an equally significant price/performance advantage over GPU-based servers with the Supermicro X12 Gaudi Training Server.
<div align="center">
<img src="gaudi.png">
</div>
- Gaudi2
Our Gaudi2 accelerator drives improved deep learning price/performance and operational efficiency for training and running state-of-the-art models, from the largest language and multi-modal models to more basic computer vision and NLP models. Designed for efficient scalability, whether in the cloud or in your data center, Gaudi2 gives the AI industry the choice it needs, now more than ever.
<div align="center">
<img src="gaudi2.png">
</div>
# Models supported
| Model name | Precision | QPS | Dataset | Metric name | Metric value | report |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| bert-torch-fp32 | BF16 | 1970 | Open Squad 1.1 | F1 Score | 85.8827 | [report](../../reports/HPU/bert-torch-fp32/) |
| albert-torch-fp32 | BF16 | 2030 | Open Squad 1.1 | F1 Score | 87.66915 | [report](../../reports/HPU/albert-torch-fp32/) |
| deberta-torch-fp32 | BF16 | 1970 | Open Squad 1.1 | F1 Score | 81.33603 | [report](../../reports/HPU/deberta-torch-fp32/) |
| resnet50-torch-fp32 | BF16 | 8279 | Open ImageNet | Top-1 | 0.7674 | [report](../../reports/HPU/resnet50-torch-fp32/) |
| swin-large-torch-fp32 | BF16 |341 | Open ImageNet | Top-1 | 0.855 | [report](../../reports/HPU/swin-large-torch-fp32/) |
# How to run
### 1. Create docker container
```bash
docker run -itd --name test --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
```
### 2. Environment initialization
Environment initialization should be performed inside the container.
```bash
docker exec -it test /bin/bash
```
### 3. Device basic information verification
hl-smi is a command-line utility that reports Gaudi device information such as card index, utilization, temperature, and power consumption.
After the driver is installed successfully, run hl-smi to check the basic device information.
```bash
hl-smi
```
### 4. Run byte-mlperf task
For example,
```bash
python launch.py --task bert-torch-fp32 --hardware_type HPU
```
For more information of the command to run the task, please refer to [ByteMLPerf](../../../README.md#usage).
add
addmm
bmm
dropout
gelu
iadd
linear
matmul
mm
softmax
embedding
cross_entropy
nll_loss
log_softmax
truediv
div
layer_norm
rsub
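This operator list appears to be the BF16 lowering list that the HPU compile backend below points LOWER_LIST at (general_perf/backends/HPU/bert/bf16.txt): under Habana's autocast, ops named in LOWER_LIST run in BF16 while ops in the companion FP32_LIST are kept in FP32. That reading is inferred from the `_update_model_env` hook further down, not stated in this commit.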
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import torch
import time
import numpy as np
from general_perf.backends import compile_backend
log = logging.getLogger("CompileBackendHPU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class CompileBackendHPU(compile_backend.CompileBackend):
def __init__(self):
super(CompileBackendHPU, self).__init__()
self.hardware_type = 'HPU'
self.need_reload = False
self.model_runtimes = []
def _update_model_env(self):
if self.model_info["model"] in ("bert-torch-fp32", "albert-torch-fp32"):
os.environ['LOWER_LIST'] ='general_perf/backends/HPU/bert/bf16.txt'
os.environ['FP32_LIST'] ='general_perf/backends/HPU/bert/fp32.txt'
def compile(self, config, dataloader=None):
result = {
"model": config['model_info']['model'],
"framework": config['model_info']['framework'],
"compile_precision": "BF16",
"optimizations":{},
"instance_count": 1,
"device_count": 1,
"input_type": config['model_info']['input_type'].split(","),
"max_batch_size": config['model_info']['max_batch_size'],
"compile_status": "success",
"sg_percent": 100,
"segments": [
{
"sg_idx":
0,
"is_fallback": False,
"input_tensor_map": config['model_info']['input_shape'],
"output_tensor_map": config['model_info']['outputs'],
"compiled_model": [
{
"compiled_bs": 1,
"compiled_obj": config['model_info']['model_path'],
},
],
},
]
}
self.configs = result
self.workload = config['workload']
self.model_info = config['model_info']
self._update_model_env()
return result
def get_interact_profile(self, config):
model_profile = []
file_path = "general_perf/backends/HPU/" + self.hardware_type + '.json'
if os.path.exists(file_path):
with open(file_path, 'r') as f:
model_profile = json.load(f)
else:
log.info(
'File path: {} does not exist, please check'.format(file_path))
return model_profile
def get_best_batch_size(self):
"""
Get Best Batch Size for the model
"""
return None
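A hedged sketch of how compile() might be driven; the config dict below only mirrors the fields compile() actually reads, and every value (input names, shapes, model path) is a placeholder rather than something taken from this commit.

```python
# Sketch only: 'config' mirrors the fields compile() reads; all values are placeholders.
backend = CompileBackendHPU()
config = {
    "workload": {"iterations": 100},
    "model_info": {
        "model": "bert-torch-fp32",
        "framework": "Pytorch",
        "input_type": "LONG,LONG,LONG",
        "max_batch_size": 64,
        "input_shape": {"input_ids.1": [1, 384], "attention_mask.1": [1, 384], "token_type_ids.1": [1, 384]},
        "outputs": "start_logits,end_logits",
        "model_path": "general_perf/model_zoo/bert-torch-fp32/bert.pt",  # placeholder path
    },
}
result = backend.compile(config)   # also exports LOWER_LIST / FP32_LIST for bert/albert
segment = result["segments"][0]    # later consumed by the runtime backend's load()
```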
transformers
tokenization
torchvision
numpy
tensorflow
bert
bert-tensorflow==1.0.1
sentencepiece
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import torch
import time
import numpy as np
from threading import Thread
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendHPU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendHPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendHPU, self).__init__()
self.hardware_type = 'HPU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Pytorch":
input_tensors = []
i = 0
for key, _ in feeds.items():
if self.input_type[i] == "FLOAT32":
datatype = torch.bfloat16
else:
datatype = pt_dtype_map[self.input_type[i]]
input_tensors.append(
torch.tensor(feeds[key],
dtype=datatype).to(
self.device,non_blocking=True))
i += 1
import habana_frameworks.torch.core as htcore
with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
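# mark_step() flushes the lazily accumulated ops as a graph to the HPU; the explicit
# stream synchronize below waits for execution to finish before outputs are read back.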
htcore.mark_step()
htcore.hpu.default_stream().synchronize()
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.float().cpu().detach().numpy() if val.dtype==torch.bfloat16 else val.cpu().detach().numpy()
elif isinstance(results, tuple):
dic = {}
for i, key in enumerate(self.outputs):
val = results[i]
dic[key] = val.float().cpu().detach().numpy() if val.dtype == torch.bfloat16 else val.cpu().detach().numpy()
results = dic
else:
results = {self.outputs[0]: results.float().cpu().numpy() if results.dtype==torch.bfloat16 else results.cpu().numpy()}
else:
print("Just test pytorch for now.")
return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
enable_profile = False
if enable_profile:
warmup_steps = 2
active_steps = 5
import habana_frameworks.torch.core as htcore
prof = torch.profiler.profile(
activities=(torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU),
schedule=torch.profiler.schedule(wait=0, warmup=warmup_steps, active=active_steps, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler('./profile/'),
record_shapes=False,
with_stack=True)
for _ in range(30):
self.predict(test_data)
if enable_profile:
prof.start()
for _ in range(iterations):
start_time = time.time()
self.predict(test_data)
end_time = time.time()
times_range.append(end_time - start_time)
if enable_profile:
prof.step()
if enable_profile:
prof.stop()
times_range.sort()
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
# start_time = time.time()
# threads = []
# for i in range(iterations):
# with torch.hpu.stream(torch.hpu.Stream()):
# threads.append(Thread(target=self.predict, args=(test_data,)))
# threads[i].start()
# for t in threads:
# t.join()
# end_time = time.time()
# qps = int(1000.0 * batch_size * iterations / (end_time-start_time))
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
import habana_frameworks.torch.core as htcore
for i, segment in enumerate(self.configs['segments']):
# there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Pytorch":
self.device = torch.device('hpu')
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj']).to(self.device)
model.to(torch.bfloat16)
model.eval()
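# Wrapping the model as an HPU graph (below) records the device graph once and replays it
# on later calls, which avoids rebuilding the graph on every inference in lazy mode.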
from habana_frameworks.torch.hpu import wrap_in_hpu_graph
model = wrap_in_hpu_graph(model)
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
compiled_models/
pre_optimized_models/
__pycache__/
<svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 797.92 157.03"><defs><style>.cls-1{fill:#292c31;}</style></defs><path class="cls-1" d="M570.12,83.64V72.08a.31.31,0,0,0-.31-.31H546.17a.33.33,0,0,1-.33-.33h0V57a.33.33,0,0,0-.33-.33H534a.33.33,0,0,0-.33.33h0V98.8a.33.33,0,0,0,.33.33h11.48a.33.33,0,0,0,.33-.33h0V84.22a.32.32,0,0,1,.3-.33h23.67A.31.31,0,0,0,570.12,83.64Z"/><rect class="cls-1" x="545.84" y="45.52" width="30.34" height="11.13" rx="0.33"/><rect class="cls-1" x="545.84" y="99.13" width="30.34" height="12.14" rx="0.33"/><path class="cls-1" d="M527.67,111.25a.33.33,0,0,0,.3-.48l-13.62-27a.31.31,0,0,1,.13-.42l0,0c6.4-2.54,10.09-9.74,10.09-17,0-17.5-13.4-20.76-18.89-20.76H478.81a.34.34,0,0,0-.34.34h0v65.08a.34.34,0,0,0,.34.33H490.1a.33.33,0,0,0,.33-.33h0V87.68a.33.33,0,0,1,.33-.33h11.37a.31.31,0,0,1,.29.19l11.68,23.53a.33.33,0,0,0,.3.18Zm-26-35.85h-10.9a.33.33,0,0,1-.33-.34h0V57.8a.32.32,0,0,1,.33-.33h12.91c6.95,0,8.87,5.39,8.87,9.16C512.55,72.6,508.3,75.4,501.65,75.4Z"/><path class="cls-1" d="M437.92,44.28l-19.87,6.45a.43.43,0,0,0-.17.12L405.6,67.76a.31.31,0,0,0-.06.19V88.84a.36.36,0,0,0,.06.2l12.28,16.9a.3.3,0,0,0,.17.12l19.87,6.46a.34.34,0,0,0,.2,0L458,106.06a.28.28,0,0,0,.16-.12L470.43,89a.31.31,0,0,0,.07-.2V68a.26.26,0,0,0-.07-.19L458.15,50.85a.4.4,0,0,0-.16-.12l-19.87-6.45A.34.34,0,0,0,437.92,44.28Zm21,41L451,96.15a.31.31,0,0,1-.16.12l-12.76,4.15a.34.34,0,0,1-.2,0l-12.76-4.15a.3.3,0,0,1-.17-.12L417.11,85.3a.37.37,0,0,1-.07-.2V71.69a.32.32,0,0,1,.07-.19L425,60.64a.43.43,0,0,1,.17-.12l12.76-4.14a.34.34,0,0,1,.2,0l12.76,4.14a.48.48,0,0,1,.16.12l7.89,10.86a.31.31,0,0,1,.06.19V85.1A.36.36,0,0,1,458.93,85.3Z"/><path class="cls-1" d="M394.9,92a.35.35,0,0,0-.47,0c-3.08,3-7.28,8.26-15.68,8.26-11.75,0-20.85-7.65-20.85-21.73,0-14.6,8.82-22,20.85-22a22,22,0,0,1,15.68,7,.32.32,0,0,0,.45,0l0,0,7.7-8.36a.27.27,0,0,0,0-.36,34.12,34.12,0,0,0-24.54-10.57c-19.67,0-32.85,14.37-32.85,34.27,0,21.47,16.65,34.07,33.11,34.07,10.54,0,19.82-6.61,24.66-11.71a.29.29,0,0,0,0-.37Z"/><path class="cls-1" d="M324.61,71.83H295.94a.33.33,0,0,1-.33-.33h0V45.85a.33.33,0,0,0-.33-.33H283.8a.33.33,0,0,0-.33.33h0v65.09a.33.33,0,0,0,.33.33h11.48a.33.33,0,0,0,.33-.33h0V84.3a.33.33,0,0,1,.33-.33h28.67a.33.33,0,0,1,.33.33h0v26.64a.33.33,0,0,0,.33.33h11.48a.33.33,0,0,0,.33-.33h0V45.85a.33.33,0,0,0-.33-.33H325.27a.33.33,0,0,0-.33.33h0V71.49a.32.32,0,0,1-.31.34Z"/><path class="cls-1" d="M183.32,45.72,163.2,88.81a.31.31,0,0,0,0,.13v22a.33.33,0,0,0,.33.33H175a.33.33,0,0,0,.33-.33V93.39a.33.33,0,0,1,.33-.33h29.69a.33.33,0,0,1,.33.33v17.55a.33.33,0,0,0,.33.33h11.47a.33.33,0,0,0,.33-.33V89a.4.4,0,0,0,0-.15L197.2,45.71a.35.35,0,0,0-.3-.19H183.62A.33.33,0,0,0,183.32,45.72Zm-3.1,34.74L190,59.36a.33.33,0,0,1,.6,0l9.74,21.1a.33.33,0,0,1-.3.47H180.52A.33.33,0,0,1,180.22,80.46Z"/><path class="cls-1" d="M151.65,79.29s0,0,.06,0l4.14-12.75a.34.34,0,0,0,0-.2l-4.13-12.7a.3.3,0,0,0-.12-.17l-11-7.83a.31.31,0,0,0-.19-.06l-32.2,0a.34.34,0,0,0-.33.34v65.08a.33.33,0,0,0,.33.33h10.33a.33.33,0,0,0,.33-.33V87.69a.33.33,0,0,1,.33-.33l12.42-.14a.33.33,0,0,1,.3.19l11.73,23.68a.33.33,0,0,0,.3.18h11.92a.33.33,0,0,0,.3-.48L143.45,85.55a.34.34,0,0,1,.1-.42Zm-32.77-3.64V56.58a.33.33,0,0,1,.33-.33h17.73a.31.31,0,0,1,.19.06l5,3.61a.31.31,0,0,1,.12.16l1.9,5.85a.34.34,0,0,1,0,.2L142.21,72a.3.3,0,0,1-.12.17l-5.36,3.77a.35.35,0,0,1-.19.06H119.21A.33.33,0,0,1,118.88,75.65Z"/><path class="cls-1" 
d="M84.85,107.77v3.17a.33.33,0,0,0,.33.33H96.66a.33.33,0,0,0,.33-.33V75.19a.33.33,0,0,0-.33-.33H70a.33.33,0,0,0-.33.33V86.66A.33.33,0,0,0,70,87H84.52a.33.33,0,0,1,.33.33v6.49a.34.34,0,0,1-.16.29L75.5,99.72a.33.33,0,0,1-.17,0h-10a.33.33,0,0,1-.17,0l-8.89-5.44a.43.43,0,0,1-.11-.12L50.8,84.87a.29.29,0,0,1,0-.16V72.09a.33.33,0,0,1,0-.17l5.35-9.29a.43.43,0,0,1,.11-.12l8.89-5.43a.34.34,0,0,1,.17,0H75.85a.37.37,0,0,1,.18,0l8.09,5.31.08.06,3.59,4.37a.34.34,0,0,0,.46.05L97,60a.32.32,0,0,0,.06-.46l-4.2-5.38a.22.22,0,0,0-.09-.07l-13.5-8.52a.28.28,0,0,0-.18-.06h-17a.41.41,0,0,0-.18,0L47.65,54.3a.32.32,0,0,0-.12.11L39.3,69a.3.3,0,0,0,0,.16V87.65a.34.34,0,0,0,0,.17l8.21,14.51.06.12,14.34,8.77a.41.41,0,0,0,.18,0H78.57a.34.34,0,0,0,.17,0l5.86-3.59A.17.17,0,0,1,84.85,107.77Z"/><path class="cls-1" d="M239.79,57.81V75.09a.33.33,0,0,0,.33.33h23.22a.33.33,0,0,1,.33.33V87h0a.33.33,0,0,1-.33.33H240.12a.33.33,0,0,0-.33.33v23.24h0a.33.33,0,0,1-.33.33h-11.3a.33.33,0,0,1-.33-.33V45.85a.33.33,0,0,1,.33-.33h35.18a.33.33,0,0,1,.33.33v11.3a.33.33,0,0,1-.33.33H240.12A.33.33,0,0,0,239.79,57.81Z"/><rect class="cls-1" x="263.67" y="57.48" width="11.96" height="17.94" rx="0.33"/><path class="cls-1" d="M628,79.9a.37.37,0,0,0,.08-.54l-5.38-6.94a.36.36,0,0,0-.52-.08l-4.66,3.41V62.42h6.71a.36.36,0,0,0,.38-.38V53a.36.36,0,0,0-.38-.38h-6.71v-13a.36.36,0,0,0-.38-.38H607.6a.37.37,0,0,0-.38.38v13h-8.78a.35.35,0,0,0-.37.38v9a.34.34,0,0,0,.37.38h8.78V79.67h-8.76a.33.33,0,0,0-.37.37v9a.35.35,0,0,0,.37.37h8.76v14.44c0,1.79-.89,2.69-3,2.69a31.05,31.05,0,0,1-5.14-.45.37.37,0,0,0-.45.42l1.34,9.9a.33.33,0,0,0,.3.32,39.61,39.61,0,0,0,5.93.49c7.45,0,11.31-3.86,11.31-10.86V87.55Z"/><path class="cls-1" d="M648.88,77.53a.36.36,0,0,0,.49.27l8.9-3.16a.34.34,0,0,0,.24-.41A130.61,130.61,0,0,0,652,52.75a.34.34,0,0,0-.45-.2l-8.45,2.65a.35.35,0,0,0-.23.49C645.4,61.94,647.71,71.82,648.88,77.53Z"/><path class="cls-1" d="M758.44,102.82c-10.39-4.53-20.06-12.92-24.69-22.09h23.91a.36.36,0,0,0,.38-.38V70.87a.36.36,0,0,0-.38-.38H725.81V60h27.45a.35.35,0,0,0,.38-.37V50.13a.35.35,0,0,0-.38-.37H725.81V39.63a.35.35,0,0,0-.37-.38h-9.85a.35.35,0,0,0-.37.38V49.76h-27a.35.35,0,0,0-.37.37v9.49a.35.35,0,0,0,.37.37h27v10.5H683.81a.36.36,0,0,0-.37.38v9.48a.36.36,0,0,0,.37.38h24.7v6.55a.38.38,0,0,0,.38.38h6.33v28.87a.34.34,0,0,0,.37.38h9.85a.35.35,0,0,0,.37-.38V85.37c3.78,10.69,15.93,22.35,26.64,26.84a.35.35,0,0,0,.46-.15l5.7-8.7A.36.36,0,0,0,758.44,102.82Z"/><path class="cls-1" d="M668.62,94.48l4.23-12.81a.75.75,0,0,0,0-.24V40.54a.37.37,0,0,0-.38-.37h-9.66a.36.36,0,0,0-.38.37V80L657,95.65l-18.56,13.51a.38.38,0,0,0-.08.53l5.75,7.92a.37.37,0,0,0,.53.08L663,104l10.41,12.92a.38.38,0,0,0,.54.06l7.33-6.25a.36.36,0,0,0,0-.52Z"/><path class="cls-1" d="M640,105.78V96.1h9.41a.37.37,0,0,0,.37-.38v-9.3a.37.37,0,0,0-.37-.38H640V43.78a.36.36,0,0,0-.38-.38h-9.3a.36.36,0,0,0-.38.38V96.1h-6.45a.36.36,0,0,0-.37.37v9.31a.35.35,0,0,0,.37.37h16.13A.37.37,0,0,0,640,105.78Z"/><path class="cls-1" d="M708.14,87.66h-9.31a.35.35,0,0,0-.37.37v12.19a.36.36,0,0,0,.37.38h9.31a.37.37,0,0,0,.37-.38V88A.36.36,0,0,0,708.14,87.66Z"/><path class="cls-1" d="M698.08,100.59H687a.35.35,0,0,0-.37.37v9.31a.36.36,0,0,0,.37.37h11.1a.36.36,0,0,0,.37-.37V101A.35.35,0,0,0,698.08,100.59Z"/></svg>
\ No newline at end of file
[
{
"name": "batch_sizes",
"note": "The batch sizes run with benchmark",
"dialog_type": "Input Dialog",
"type": "str",
"default": "",
"depends": null
},
{
"name": "converter_options",
"note": "PopRT converter options will be used, please referes to the PopRT documentation",
"dialog_type": "Input Dialog",
"type": "str",
"default": "{}",
"depends": null
},
{
"name": "compiler_options",
"note": "PopRT compiler options will be used, please referes to the PopRT documentation",
"dialog_type": "Input Dialog",
"type": "str",
"default": "{}",
"depends": null
},
{
"name": "clients",
"note": "The number of clients used to run the benchmark",
"dialog_type": "Input Dialog",
"type": "str",
"default": "1",
"depends": null
},
{
"name": "pack",
"note": "Use pack solution to run the model or not",
"dialog_type": "Yes/No Dialog",
"type": "bool",
"default": "false",
"depends": null
},
{
"name": "runtime_options",
"note": "PopRT runtime options will be applied to RuntimeConfig, please referes to the PopRT documentation",
"dialog_type": "Input Dialog",
"type": "str",
"default": "{}",
"depends": null
},
{
"name": "precision",
"note": "请指定模型的精度",
"dialog_type": "Radiolist Dialog",
"options": [
"fp8",
"fp16"
],
"type": "str",
"default": "FP16",
"depends": null
}
]
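These entries look like the interactive-parameter profile that a get_interact_profile()-style hook (such as the HPU one above) hands back to the harness: each record names one prompt along with its dialog type, value type, and default. Below is a minimal sketch of collapsing the profile into a {name: default} dict; the file path is a guess modelled on the HPU pattern and is not taken from this commit.

```python
import json

# Path is an assumption following the HPU pattern (general_perf/backends/<HW>/<HW>.json).
with open("general_perf/backends/IPU/IPU.json") as f:
    profile = json.load(f)

defaults = {entry["name"]: entry["default"] for entry in profile}
# e.g. defaults["clients"] == "1" and defaults["pack"] == "false"
```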