Commit 24b257f1 authored by sunzhq2's avatar sunzhq2

init

parent 920b3c0f
numpy==1.23.0
protobuf==3.20.2
torch==1.13.1
tensorflow==2.7.2
tf2onnx
wrapt==1.14.1
onnx==1.13.1
bert-tensorflow==1.0.1
opencv-python-headless
torchvision
# Copyright 2023 Graphcore Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import numpy as np
from poprt.runtime import PackAlgorithm, PackRunnerConfig, RuntimeConfig
from general_perf.backends import runtime_backend
from . import engine_poprt
log = logging.getLogger("RuntimeBackendIPU")
class RuntimeBackendIPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendIPU, self).__init__()
self.hardware_type = "IPU"
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.pack_config = None
self.batch_size = -1
self.pack_bs = -1
self.packrunner = False
self.engine = None
self.runner_name = "POPRT"
self.compiled_dir = (
os.path.split(os.path.abspath(__file__))[0] + "/compiled_models"
)
self.precision = "fp32"
def predict(self, feeds, test_benchmark=False):
        # apply small adjustments to IPU results to align them with the CPU's
self._input_adjustment(feeds)
results = self.engine.predict(feeds)
if "videobert" in self.workload["model"]:
            # open_cifar requires the outputs as: logits_per_image, logits_per_text
return results["3034"], results["3035"]
return results
def _get_engine(self, batch_size):
        if self.batch_size != batch_size:
self.update_packrunner_info()
self.batch_size = batch_size if not self.packrunner else 1
interact_info = self.configs.get("interact_info", {})
interact_info.get("runtime_options", {})
is_pack = interact_info.get("pack_config", False)
if not is_pack:
config = RuntimeConfig()
else:
config = PackRunnerConfig()
            # keep the packing timeout short since test_accuracy.py does not support async mode: let the packing time out in PopRT as soon as possible
assert interact_info.get(
"pack_config"
), "pack mode requires 'pack_config'"
self.pack_config = interact_info["pack_config"]
assert (
"dynamic_input_name" in self.pack_config
), "you must specify the name of the input who has dynamic length."
assert (
"mask_name" in self.pack_config
), "you must specify the name of 'mask' input for pack runner."
assert (
"input_names" in self.pack_config
), "you must specify all input names for pack runner for auto padding removal."
mask_name = self.pack_config["mask_name"]
if self.configs["model"] == "deberta-torch-fp32":
config.enable_input_single_row_mode(mask_name, "unpack_info", 1)
else:
config.enable_input_single_row_mode(mask_name)
config.dynamic_input_name = self.pack_config["dynamic_input_name"]
config.timeout_microseconds = self.pack_config.get(
"timeout_microseconds", 15000
)
# best performance mode
config.algorithm = PackAlgorithm.first_fit
config.max_valid_num = self.pack_config.get("max_pack_num", 40)
# remove user provided padded zeros in pack runner
config.enable_padding_remove_mode(
self.pack_config["mask_name"],
[n for n in self.pack_config["input_names"] if n != mask_name],
)
if self.runner_name == "POPRT":
if self.engine:
del self.engine
self.engine = engine_poprt.PopRT(self.popef_path, config)
else:
raise ValueError("engine_name must be POPRT")
return self.engine
def benchmark(self, dataloader):
report = {}
report["BS"] = self.batch_size
interact_info = self.configs.get("interact_info", {})
if self.packrunner:
report["BS"] = self.pack_bs
iterations = self.workload["iterations"]
qps, avg_latency, tail_latency = self.engine.benchmark_pack(
interact_info["pack_config"], iterations
)
else:
iterations = self.workload["iterations"]
clients = interact_info.get("clients", 1)
qps, avg_latency, tail_latency = self.engine.benchmark(
clients, self.batch_size, iterations
)
report["QPS"] = int(qps)
report["AVG Latency"] = avg_latency
report["P99 Latency"] = tail_latency
return report
def get_loaded_batch_size(self):
# return self.workload['batch_sizes'][0]
return self.batch_size
def load(self, batch_size) -> None:
# synchronize configuration updates from compile backend
self.update_packrunner_info()
self.precision = (
self.configs.get("interact_info", {}).get("converter_options", {})
.get("precision", "FP32")
.upper()
)
if self.packrunner:
batch_size = self.pack_bs
self.popef_path = os.path.join(
self.compiled_dir,
self.configs["model"],
str(batch_size),
"executable_{}.popef".format(self.precision),
)
self._get_engine(batch_size)
def update_packrunner_info(self):
interact_info = self.configs.get("interact_info", {})
is_pack = interact_info.get("pack_config", False)
if not is_pack:
return
pack_config = interact_info["pack_config"]
if is_pack:
self.packrunner = True
self.pack_bs = pack_config["batch_size"]
def _input_adjustment(self, inputs):
        # packing mode requires "position_ids" for bert-like models
if self.packrunner:
seq_len = np.count_nonzero(inputs[self.pack_config["mask_name"]])
if self.configs["model"] == "roberta-torch-fp32":
inputs["position_ids"] = np.arange(seq_len, dtype=np.int32) + 1
elif self.configs["model"] in ("albert-torch-fp32", "bert-torch-fp32"):
inputs["position_ids"] = np.arange(seq_len, dtype=np.int32)
elif self.configs["model"] == "deberta-torch-fp32":
inputs["unpack_info"] = np.zeros(1, dtype=np.int32)
inputs.pop("token_type_ids")
# Sparse Processing Unit (SPU)
Moffett AI, the inventor of the dual-sparsity algorithm, holds world-leading sparse computing technology, with more
than 30 patents worldwide. The company builds a new generation of AI computing platform through hardware/software
co-design, achieving order-of-magnitude gains in computing performance while reducing latency and total cost of
ownership (TCO).
The results in Byte MLPerf demonstrate the potential of sparse computing for inference performance and energy
efficiency, which leads to a lower TCO.
For Byte MLPerf, Moffett has submitted performance results of the following models.
| Model | Precision | Sparsity* | QPS | Dataset | Metric name | Metric value |
|----------------------|-----------|----------|------|----------------|-------------|--------------|
| resnet50-torch-fp32 | INT8 | 16x | 52423 | Open Imagenet | Top-1 | 76.61% |
| bert-torch-fp32 | INT8/BF16 | 16x | 7738 | Open Squad 1.1 | F1 Score | 86.09 |
| albert-torch-fp32 | INT8/BF16 | 16x | 10824 | Open Squad 1.1 | F1 Score | 87.66 |
| roberta-torch-fp32 | INT8/BF16 | 16x | 8107 | Open Squad 1.1 | F1 Score | 86.63 |
| conformer-encoder-onnx-fp32 | INT8/BF16 | 8x | 8211 | Fake Dataset | Mean Diff | 1.50 |
\* The sparsity figure is determined by the ratio of time spent on Matmul operations to the overall model inference time.
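To make the footnote concrete, the sketch below is an illustration only; the Matmul time fraction used here is a hypothetical placeholder, not a measured value. It shows how the end-to-end gain relates to the share of inference time spent in Matmul, assuming an Amdahl-style composition:

```python
# Illustrative only: end-to-end speedup when only Matmul ops are accelerated by sparsity.
# matmul_fraction is a hypothetical share of total inference time spent in Matmul.
def effective_speedup(matmul_fraction: float, matmul_speedup: float) -> float:
    return 1.0 / ((1.0 - matmul_fraction) + matmul_fraction / matmul_speedup)

# Example: if 90% of the time were spent in Matmul and Matmul ran 16x faster,
# the whole model would run roughly 6.4x faster end to end.
print(round(effective_speedup(0.9, 16.0), 1))
```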
Besides the performance results, energy efficiency is another significant highlight of Moffett's devices. For example,
the peak power consumption of the S30 is merely 250 W.
The Antoum architecture, built through hardware/software co-design together with Moffett's original sparsity
algorithm, is what delivers this performance with high energy efficiency.
The accelerators for AI inference applications in data centers are equipped with Moffett's first-generation Antoum
processor, the first commercial AI processor in the world with 32x sparsity.
Besides the sparse processing units (SPU) for native sparse convolution and matrix computing, Antoum also integrates a
Vector Processing Unit (VPU), which enables flexible programmability to keep up with the fast evolution of AI models.
In addition, the on-chip video codec, which supports 192-way 1080p video decoding at 30 FPS, and the JPEG decoder,
which supports 1080p image decoding at up to 6960 FPS, provide end-to-end capability for video and image inference
workloads.
Moffett provides three SKUs of sparse computing devices: S4, S10, and S30. The S10 and S30 are designed to deliver 2x
and 3x the computing power of the S4, respectively.
## How to run
### 1. Environmental preparation
#### Download offline image
```bash
wget moffett-oss-bucket01.oss-cn-shenzhen.aliyuncs.com/byte-perf/byte-perf-2.3.2-20230721.tar
```
#### Load offline image
```bash
docker load -i byte-perf-2.3.2-20230721.tar
```
#### Download and decompress the model data package
```bash
wget moffett-oss-bucket01.oss-cn-shenzhen.aliyuncs.com/byte-perf/byte-perf-data.tar.gz
tar -zxvf byte-perf-data.tar.gz
```
### 2. Create docker container
Note: `--shm-size` should be set to about 95% of the host's total memory (for example, `--shm-size="300g"`); see the helper sketch after the command below.
```bash
cd byte-perf-data
sudo docker run -itd \
--privileged \
--cap-add=ALL \
--net=host \
-v /dev:/dev \
-v /usr/src:/usr/src \
-v /lib/modules:/lib/modules \
-v $PWD/package:/home/moffett/workspace/package \
-e ROOT_PASS=moffett \
--shm-size="300g" \
--name byte-perf-2023 \
byte-perf:2.3.2-20230721
```
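If you are unsure what value to pass for `--shm-size`, the short sketch below (an illustrative helper, not part of the Moffett package) prints roughly 95% of the host's physical memory in gigabytes on Linux:

```python
# Illustrative helper: print ~95% of total host memory for use with --shm-size.
import os

page_size = os.sysconf("SC_PAGE_SIZE")      # bytes per memory page
phys_pages = os.sysconf("SC_PHYS_PAGES")    # number of physical pages
total_gb = page_size * phys_pages / 1024 ** 3
print(f'--shm-size="{int(total_gb * 0.95)}g"')
```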
### 3. Environment initialization
Perform environment initialization inside the container.
```bash
docker exec -it byte-perf-2023 /bin/bash
```
#### Install drivers and load firmware
```bash
cd /usr/local/sola/driver/bin/
sudo ./setup.sh
```
#### Device basic information verification
mf-smi is a command-line utility that displays various information about the S30, such as the card index, utilization, temperature, and power consumption.
After the driver is installed successfully, run mf-smi to view basic information about the device.
```bash
mf-smi
```
### 4. Run byte-mlperf task in container
```bash
cd /home/moffett/workspace/package/bytemlperf
# config spu-backend env
export PYTHONPATH=$PYTHONPATH:/home/moffett/workspace/spu-backend-release/ubuntu18.04-gcc7.5.0-x86_64/lib/
# conformer
python3 launch.py --task conformer-encoder-onnx-fp32 --hardware_type SPU
# albert
python3 launch.py --task albert-torch-fp32 --hardware_type SPU
# bert
python3 launch.py --task bert-torch-fp32 --hardware_type SPU
# roberta
python3 launch.py --task roberta-torch-fp32 --hardware_type SPU
# resnet50
python3 launch.py --task resnet50-torch-fp32 --hardware_type SPU
```
## Contact us
If you are interested in further information about the products, please contact the email: sales@moffett.ai
from torch.utils.data import DataLoader as DataLoaderX
from dataset.dataset import ImageNetDataset, MZJBertDataset, DummyDataset
from nn_compiler.common.constants import OpType
from common_compile import SparsertBaseBuilder
import onnx
def get_onnx_input_info(onnx_model_path):
# Load ONNX model
model = onnx.load(onnx_model_path)
# Initialize an empty dictionary to store input names and shapes
input_info = {}
# Iterate through the inputs of the model
for input in model.graph.input:
input_name = input.name
input_shape = [dim.dim_value for dim in input.type.tensor_type.shape.dim]
input_info[input_name] = input_shape
return input_info
def get_model_input_info(onnx_input_info,batch_size):
config_input_dict = {}
input_shape_dict = {}
for input_name,input_shape in onnx_input_info.items():
config_input_dict[input_name] = input_name
input_shape[0] = batch_size
input_shape_dict[input_name] = input_shape
return config_input_dict,input_shape_dict
class Resnet50Builder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(Resnet50Builder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# calibration dataset config
dataset = ImageNetDataset(self.dataset_dir, transform_file=self.dataset_cfg)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size)
self.config.calib_batch = 1
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# you can also set other configs here
self.config.do_kl = True
self.config.opt_level = 8
self.config.total_cores = 1
class BertBaseBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(BertBaseBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class AlbertBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(AlbertBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class RobertaBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(RobertaBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class ConformerBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(ConformerBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = DummyDataset(input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class GeneralBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(GeneralBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = DummyDataset(input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import logging
from typing import Any, Dict, List, Optional
from general_perf.backends import compile_backend
from base_compile import Resnet50Builder, BertBaseBuilder, AlbertBuilder, RobertaBuilder, ConformerBuilder, GeneralBuilder
log = logging.getLogger("CompileBackendSPU")
class CompileBackendSPU(compile_backend.CompileBackend):
def __init__(self):
super(CompileBackendSPU, self).__init__()
self.hardware_type = "SPU"
self.batch_size = None
self.model_name = ""
self.configs = None
self.workload = None
self.model_info = None
self.model = None
self.interact_info = None
def compile(self,
configs: Dict[str, Any],
dataloader=None) -> Dict[str, Any]:
"""
Model compilation interface. Model conversion and compilation
can be performed here. The model format can be changed here.
"""
model_info = configs["model_info"]
        name = model_info['model']
builder_dict = {
"resnet50-torch-fp32": Resnet50Builder,
"bert-torch-fp32": BertBaseBuilder,
"albert-torch-fp32": AlbertBuilder,
"roberta-torch-fp32": RobertaBuilder,
"conformer-encoder-onnx-fp32": ConformerBuilder
}
if name in builder_dict:
SparserBuilder = builder_dict[name]
else:
SparserBuilder = GeneralBuilder
interact_info = self.get_interact_profile(configs)
onnx_path = interact_info["onnx_path"]
        dump_dir = os.path.dirname(os.path.abspath(interact_info["model_path"]))
dataset_dir = interact_info["calibration_dir"]
dataset_cfg = interact_info["transform_file"]
model_precision = interact_info["model_precision"]
batch_size = interact_info["batch_size"]
verify = interact_info["verify"]
builder = SparserBuilder(onnx_path, dump_dir, dataset_dir, dataset_cfg, model_precision, batch_size, verify)
compile_info = builder()
result = {
"model": configs["model_info"]["model"],
"framework": configs["model_info"]["framework"],
"compile_precision": model_precision,
"input_type": configs["model_info"]["input_type"].split(","),
"max_batch_size": configs["workload"]["batch_sizes"][-1],
"compile_status":"success",
"sg_percent": 100,
"sparsity_ratio":compile_info["sparsity_ratio"],
"segments": [
{
"sg_idx": 0,
"is_fallback": False,
"input_tensor_map": configs["model_info"]["input_shape"],
"output_tensor_map": configs["model_info"]["outputs"],
"compiled_model": [
{
"compiled_bs": batch_size,
"compiled_obj": dump_dir,
},
],
},
],
"interact_info": interact_info,
}
return result
def get_interact_profile(self, configs: Dict[str, Any]):
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"interact_info/{configs['model_info']['model']}.json")
if os.path.exists(file_path):
with open(file_path, 'r') as f:
model_profile = json.load(f)
return model_profile
else:
log.info('File path: {} does not exist, please check'.format(file_path))
raise NotImplementedError("CompileBackend:get_interact_profile")
def get_best_batch_size(self) -> Optional[List[int]]:
"""
Get Best Batch Size for the model
"""
return [1]
{
"model": "albert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/albert.onnx",
"model_path": "general_perf/download/moffett/converted_models/albert-mf-int8/albert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/albert-base-v2_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "bert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/bert-base.onnx",
"model_path": "general_perf/download/moffett/converted_models/bert-mf-int8/bert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/bert-base-uncased_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "conformer-encoder-onnx-fp32",
"onnx_path": "general_perf/download/moffett/models/conformer.onnx",
"model_path": "general_perf/download/moffett/converted_models/conformer-mf-int8/conformer-mf-int8.zip",
"calibration_dir": " ",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "none",
"framework": "sparsert"
}
{
"model": "resnet50-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/resnet50.onnx",
"model_path": "general_perf/download/moffett/converted_models/resnet50-mf-int8/resnet50-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/imgnet_calibrate_data",
"transform_file": "general_perf/download/moffett/compiler_wrapper/resnet50-mf-int8/mxnet_imagenet_trans_224.json",
"batch_size": 4,
"verify": false,
"model_precision": "INT8",
"dataset_name": "open_imagenet",
"framework": "sparsert"
}
{
"model": "roberta-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/roberta.onnx",
"model_path": "general_perf/download/moffett/converted_models/roberta-mf-int8/roberta-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/roberta-base_squad.npy",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
matplotlib==3.4.2
mypy-protobuf==2.8
protobuf==3.19.6
numpy==1.23.4
pandas==1.5.1
torch==1.9.1
tensorflow==2.8.4
bert-tensorflow==1.0.1
torchvision==0.10.1
sentencepiece==0.1.96
tokenization
tensorflow-datasets==4.7.0
google-cloud-core==2.3.2
import logging
import tensorflow as tf
import torch
import numpy as np
import time
import yaml
import multiprocessing
from multiprocessing import Manager
from general_perf.backends import runtime_backend
from inference import ModelFactory
from threading import Thread
hardware_type = "spu".upper()
tf.get_logger().setLevel('ERROR')
log = logging.getLogger(f"Backend-{hardware_type}")
bfloat16 = tf.bfloat16.as_numpy_dtype
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
tf_dtype_map = {
"FLOAT32": tf.float32,
"FLOAT16": tf.float16,
"INT32": tf.int32,
}
INPUT_TYPE = {
"UINT8": np.uint8,
"INT8": np.int8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class RuntimeBackendSPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendSPU, self).__init__()
self.hardware_type = hardware_type
self.batch_size = None
self.input_rank = []
self.model_name = ""
self.current_batch_size = 4
self.dry_run = False
self.output_dtype = None
self.output_shape = None
self.configs = None
self.workload = None
self.model_info = None
self.model = None
self.order = None
self.yaml_config = None
self.need_reload = True
self.all_resnet50_start_time_list = []
self.all_resnet50_end_time_list = []
def predict(self, feeds):
input_name_list = self.configs['input_name']
if not self.model:
log.info("no model_runtime...")
self.load(self.get_loaded_batch_size())
if self.model_name == "resnet50-torch-fp32":
request = [feeds[name] for name in input_name_list]
response = self.model.inference(request)
return response
elif self.model_name == "conformer-encoder-onnx-fp32":
request = [feeds[name] for name in input_name_list]
response = self.model.inference(request)
return response
elif self.model_name in ["bert-torch-fp32", "albert-torch-fp32", "roberta-torch-fp32"]:
request, model_info = self.model.preprocess(feeds, self.yaml_config)
response = self.model.inference(request)
response = self.model.postprocess(response, model_info)
return response
else:
raise NotImplementedError(f"task: {self.model_name} not supported")
def callback_func(self):
end_time = time.time() * 1000
self.all_resnet50_end_time_list.append(end_time)
def benchmark(self, dataloader):
batch_sizes = self.workload['batch_sizes']
reports = []
iterations = self.workload['iterations']
for idx, batch_size in enumerate(batch_sizes):
if batch_size != self.batch_size:
continue
self.yaml_config.update(
{"min_batch_size": self.yaml_config['chunk_size']})
report = {}
qps = None
dataloader.rebatch(batch_size)
input_name_list = self.configs['input_name']
if self.model_name == "resnet50-torch-fp32":
test_data, _ = dataloader.get_samples(0)
all_resnet50_start_time_list = self.all_resnet50_start_time_list
all_resnet50_end_time_list = self.all_resnet50_end_time_list
self.model = ModelFactory(self.model_info)
model = self.model
model.load_model()
model.device_num = 3
request = [test_data[name] for name in input_name_list]
for _ in range(iterations):
resnet50_start_time = time.time()
all_resnet50_start_time_list.append(resnet50_start_time * 1000)
output_data = self.model.inference(request, self.callback_func)
start_time_list = all_resnet50_start_time_list
end_time_list = all_resnet50_end_time_list
elif self.model_name == "conformer-encoder-onnx-fp32":
test_data = dataloader.get_samples(0)
all_conformer_start_time_list = []
all_conformer_end_time_list = []
self.model = ModelFactory(self.model_info)
model = self.model
model.load_model()
model.device_num = 3
request = [test_data[name] for name in input_name_list]
start = time.time()
for _ in range(iterations):
conformer_start_time = time.time()
all_conformer_start_time_list.append(conformer_start_time * 1000)
output_data = self.model.inference(request)
conformer_end_time = time.time()
all_conformer_end_time_list.append(conformer_end_time * 1000)
start_time_list = all_conformer_start_time_list
end_time_list = all_conformer_end_time_list
elif self.model_name in ["bert-torch-fp32", "albert-torch-fp32", "roberta-torch-fp32"]:
test_data, _ = dataloader.get_samples(0)
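                # For bert-like models, benchmarking builds a 4-stage pipeline
                # (input -> preprocess -> inference -> postprocess), with the stages
                # connected by queues so that preprocessing, NPU inference, and
                # postprocessing overlap. Start timestamps are recorded when a batch
                # enters the pipeline and end timestamps when its result is consumed.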
def input_worker(_input_queue, data, iteration, shared_list):
for i in range(iteration):
batch_start_time = time.time()
shared_list.append(batch_start_time)
_input_queue.put(data)
_input_queue.put(None)
return
def preprocessing_worker(_input_queue, _preprocess_queue, _info_queue, model_info):
while True:
data = _input_queue.get()
if data is None:
_info_queue.put(None)
_preprocess_queue.put(None)
return
input_data_list, info = self.model.preprocess(data, model_info)
_preprocess_queue.put(input_data_list)
_info_queue.put(info)
def inference_worker(_preprocess_queue, _inference_queue, config):
self.model = ModelFactory(config)
model = self.model
model.load_model()
model.device_num = 3
while True:
data = _preprocess_queue.get()
if data is None:
_inference_queue.put(None)
model.destroy()
return
output_data = self.model.inference(data)
_inference_queue.put(output_data)
def postprocessing_worker(_inference_queue, _postprocess_queue, _info_queue):
while True:
data = _inference_queue.get()
info = _info_queue.get()
if data is None:
_postprocess_queue.put(None)
_info_queue.put(None)
return
_postprocess_queue.put(self.model.postprocess(data, info))
def consumer(_postprocess_queue, shared_end_list):
ans = []
while True:
i = _postprocess_queue.get()
if i is None:
return ans
ans.append(i)
batch_end_time = time.time()
shared_end_list.append(batch_end_time)
manager = Manager()
shared_start_list = manager.list()
shared_end_list = manager.list()
# Inference Pipeline
input_queue = multiprocessing.JoinableQueue()
preprocess_queue = multiprocessing.JoinableQueue()
info_queue = multiprocessing.JoinableQueue()
inference_queue = multiprocessing.JoinableQueue()
postprocess_queue = multiprocessing.JoinableQueue()
                # [0] process that feeds input data
input_process = multiprocessing.Process(
target=input_worker, args=(input_queue, test_data, iterations, shared_start_list))
                # [1] model preprocessing process
preprocessing_process = multiprocessing.Process(
target=preprocessing_worker, args=(input_queue, preprocess_queue, info_queue, self.yaml_config))
                # [2] model inference worker (runs in a thread)
inference_process = Thread(
target=inference_worker, args=(preprocess_queue, inference_queue, self.yaml_config))
                # [3] model postprocessing process
postprocessing_process = multiprocessing.Process(
target=postprocessing_worker, args=(inference_queue, postprocess_queue, info_queue))
                # start timing
input_process.start()
preprocessing_process.start()
inference_process.start()
postprocessing_process.start()
processes = [input_process, preprocessing_process, inference_process, postprocessing_process]
responses = consumer(postprocess_queue, shared_end_list)
for p in processes:
p.join()
start_time_list = list(shared_start_list)
end_time_list = list(shared_end_list)
else:
raise NotImplementedError(f"task: {self.model_name} not supported")
            # stop timing
all_latency = [(x - y) * 1000 if self.model_name != "resnet50-torch-fp32" else x - y for x, y in zip(end_time_list, start_time_list)]
all_latency.sort()
index = int(len(all_latency) * 0.99)
tail_latency = all_latency[index] / 1000
avg_latency = sum(all_latency) / len(all_latency) / iterations
if not qps:
qps = round(1000 * batch_size / avg_latency, 2)
tail_latency = round(tail_latency, 2)
avg_latency = round(avg_latency, 2)
qps = round(qps, 2)
log.info(
"\033[32m" + f"Report: Batch Size is {batch_size}, QPS is {qps}, AVG Latency is {avg_latency} ms, P99 Latency is {tail_latency} ms" + "\033[0m")
report['BS'] = batch_size
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
print(f"AVG Latency:{avg_latency}, P99 Latency:{tail_latency}")
reports.append(report)
return reports
def get_loaded_batch_size(self):
# only used in accuracy mode, not in benchmark.
name = self.configs['model']
self.yaml_config.update(
{"min_batch_size": self.yaml_config['chunk_size']})
if "bert-torch-fp32" in name or "albert-torch-fp32" in name:
return 12
elif "roberta-torch-fp32" in name:
return 4
elif "resnet50-torch-fp32" in name:
return 16
elif "conformer-encoder-onnx-fp32" in name:
return 16
else:
raise NotImplementedError(f"task : {name} not supported")
def load(self, batch_size):
self.batch_size = batch_size
self.model_name = self.configs['model']
self.model_info.update({"input_name": self.model_info['inputs'].split(",")})
task_name = self.model_info["model"]
self.yaml_config = yaml.safe_load(open(f"./general_perf/download/moffett/converted_models/{task_name}.yaml", "r"))
self.yaml_config.update({
"model": self.configs["model"],
"input_name": self.model_info["input_name"],
"dataset_name": self.model_info['dataset_name']
})
if 'input_order' in self.yaml_config["model_input"][0]:
self.yaml_config.update(
{"input_order": [inp['input_order'] for inp in self.yaml_config["model_input"]]})
if self.need_reload:
self.model = ModelFactory(self.model_info)
self.model.load_model()
self.model.device_num = 1
self.need_reload = False
else:
log.info("model has been loaded, skip load process")
<div align="center">
<img src="STC.jpg">
</div>
# Supported model inference results
| Model name | QPS | Dataset | Metric name | Metric value |
| :-----:| :----: | :----: | :----: | :----: |
| albert-torch-fp32 | 824.49 | Open Squad 1.1 | F1 Score | 87.66 |
| bert-tf-fp32 | 822.38 | Open Squad 1.1 | F1 Score | 86.45 |
| bert-torch-fp32 | 813.86 | Open Squad 1.1 | F1 Score | 86.14 |
| resnet50-tf-fp32 | 8725.94 | Open ImageNet | Top-1 | 77.24% |
| robert-torch-fp32 | 800.7 | Open Squad 1.1 | F1 Score | 83.19 |
| widedeep-tf-fp32 | 2395899.9 | Open Criteo Kaggle | Top-1 | 77.39% |
For more detailed result information, see general_perf/reports/STC/. The models above are deployed on an NPU (Neural-network Processing Unit) card, the STCP920, which is designed and manufactured by Beijing Stream Computing Technology Co., LTD. Software associated with the STCP920 is as follows:
| Software | Version | Description |
| :-----:| :----: | :----: |
| HPE | 1.5.1 | Heterogeneous Programming Environment |
| TensorTurbo | 1.11.0 | An AI compiler for STCP920 developed based on TVM |
| STC_DDK | 1.1.0 | Deployment Development Kit for the STCP920, which includes the AI Convertor, AI Executor, and utilities used in model conversion. |
In addition, a variety of tools are provided for monitoring the status of NPU devices, debugging heterogeneous programs, and analyzing the accuracy and performance of NPU programs.
| Software | Description |
| :-----:| :----: |
| stc-smi | Stream Computing System Management Interface for managing and monitoring NPU devices, including viewing device information and resource usage |
| stc-gdb | Stream Computing Debugger for debugging heterogeneous NPU programs |
| stc-prof | Stream Computing Profiler, for performance analysis and optimization of heterogeneous programs |
| stc-hpaa | Stream Computing Half-Precision Accuracy Analysis, for locating calculation errors and the corresponding data |
For more detailed software information, please refer to: https://docs.streamcomputing.com/_/sharing/vSxLMI20nalGphdpXdEVoDg6JkUcfEkT?next=/zh/latest/
# How to run
1. Prepare environment
Prepare a machine with an STCP920 card, install HPE, and install the packages in general_perf/requirements.txt. Then create a virtual environment, install the packages in general_perf/backends/STC/requirements.txt, and install TensorTurbo and STC_DDK. The installation packages can be obtained from this link: https://docs.streamcomputing.com/_/sharing/vSxLMI20nalGphdpXdEVoDg6JkUcfEkT?next=/zh/latest/
```bash
export PYTHONPATH=$PYTHONPATH:ByteMLPerf:ByteMLPerf/general_perf/backends/STC
```
2. Prepare model and dataset
Run general_perf/prepare_model_and_dataset.sh to get model and dataset.
3. Run
```bash
python3 launch.py --task xxx --hardware_type STC
```
The --task parameter specifies the workload to run and must be provided. For example, to evaluate the workload bert-tf-fp16.json, specify --task bert-tf-fp16.
# Company introduction
Beijing Stream Computing Technology Co., LTD is committed to providing cloud service providers with cost-effective and highly versatile AI accelerator chips.
The first-generation chip achieves 128 TFLOPS of half-precision floating-point performance, twice that of the T4. At present, the first-generation NPU card, the STCP920, is in mass production, and a batch of units has already been shipped to users. The second-generation products are on schedule and will arrive in 2023.
# The technical specifications of the first-generation chip
| Name | Value |
| :-----:| :----: |
| AI Computation power | 128 TFLOPS @ FP16 |
| Memory Type | LPDDR4X |
| Memory | 16GB, 119.4GB/S |
| Last Level Buffer | 8MB, 256GB/s |
| Level 1 Buffer | 1.25MB, 512GB/s |
| Host Interface | PCIe 4.0 x16, 32GB/s, supports Lane Reversal |
| Thermal Design Power | 160W |
| Structural Dimension | 268.44mm x 111.15mm, single slot |
# What we have done
We provide development kits that support converting any deep learning model into an STC engine and deploying it on a CPU+NPU server.
An AI compiler (TensorTurbo) converts selected parts of a deep learning model into an NPU-executable file. The compiler applies a series of transformations and optimizations during model conversion to ensure better inference performance of the result.
Using the associated software, we have supported over 150 open-source models from four deep learning frameworks: TensorFlow 1.x and 2.x, PyTorch, ONNX, and PaddlePaddle. The application fields include CV, NLP, recommendation, speech, OCR, and multimodal. Most of the models achieve 2x the inference performance of the NVIDIA T4 GPU.
# Contact us
If you are interested in further information about the product, please contact the email: johnson@streamcomputing.com
{
"albert-torch-fp32": {
"best_batch": 4
},
"bert-tf-fp32": {
"best_batch": 2
},
"bert-torch-fp32": {
"best_batch": 2
},
"resnet50-tf-fp32": {
"best_batch": 16
},
"robert-torch-fp32": {
"best_batch": 8
},
"widedeep-tf-fp32": {
"best_batch": 2048
}
}
tensorflow==1.15.0
protobuf==3.19.4
decorator
graphviz==0.8.4
scipy
attrs==21.2.0
pyyaml
synr==0.4
multipledispatch
pytest
matplotlib==3.3.4
pulp
scikit-learn
torchvision==0.13.1
torch==1.12.1
onnx==1.12.0
onnxconverter-common==1.12.2
onnxruntime==1.12.0
pybind11>=2.9.1
bert-tensorflow==1.0.1
tqdm
psutil
pydot
sentencepiece==0.1.96
virtualenv==16.7.9
keras2onnx
pybind11
setuptools==67.5