Commit 24b257f1 authored by sunzhq2's avatar sunzhq2

init

parent 920b3c0f
numpy==1.23.0
protobuf==3.20.2
torch==1.13.1
tensorflow==2.7.2
tf2onnx
wrapt==1.14.1
onnx==1.13.1
bert-tensorflow==1.0.1
opencv-python-headless
torchvision
# Copyright 2023 Graphcore Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import numpy as np
from poprt.runtime import PackAlgorithm, PackRunnerConfig, RuntimeConfig
from general_perf.backends import runtime_backend
from . import engine_poprt
log = logging.getLogger("RuntimeBackendIPU")
class RuntimeBackendIPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendIPU, self).__init__()
self.hardware_type = "IPU"
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.pack_config = None
self.batch_size = -1
self.pack_bs = -1
self.packrunner = False
self.engine = None
self.runner_name = "POPRT"
self.compiled_dir = (
os.path.split(os.path.abspath(__file__))[0] + "/compiled_models"
)
self.precision = "fp32"
def predict(self, feeds, test_benchmark=False):
        # apply small adjustments to IPU results to align them with the CPU's
self._input_adjustment(feeds)
results = self.engine.predict(feeds)
if "videobert" in self.workload["model"]:
            # open_cifar requires the outputs as: logits_per_image, logits_per_text
return results["3034"], results["3035"]
return results
def _get_engine(self, batch_size):
        if self.batch_size != batch_size:
self.update_packrunner_info()
self.batch_size = batch_size if not self.packrunner else 1
interact_info = self.configs.get("interact_info", {})
interact_info.get("runtime_options", {})
is_pack = interact_info.get("pack_config", False)
if not is_pack:
config = RuntimeConfig()
else:
config = PackRunnerConfig()
            # keep the packing timeout short since test_accuracy.py does not support async mode: let the packing time out in PopRT as soon as possible
assert interact_info.get(
"pack_config"
), "pack mode requires 'pack_config'"
self.pack_config = interact_info["pack_config"]
assert (
"dynamic_input_name" in self.pack_config
), "you must specify the name of the input who has dynamic length."
assert (
"mask_name" in self.pack_config
), "you must specify the name of 'mask' input for pack runner."
assert (
"input_names" in self.pack_config
), "you must specify all input names for pack runner for auto padding removal."
mask_name = self.pack_config["mask_name"]
if self.configs["model"] == "deberta-torch-fp32":
config.enable_input_single_row_mode(mask_name, "unpack_info", 1)
else:
config.enable_input_single_row_mode(mask_name)
config.dynamic_input_name = self.pack_config["dynamic_input_name"]
config.timeout_microseconds = self.pack_config.get(
"timeout_microseconds", 15000
)
# best performance mode
config.algorithm = PackAlgorithm.first_fit
config.max_valid_num = self.pack_config.get("max_pack_num", 40)
# remove user provided padded zeros in pack runner
config.enable_padding_remove_mode(
self.pack_config["mask_name"],
[n for n in self.pack_config["input_names"] if n != mask_name],
)
if self.runner_name == "POPRT":
if self.engine:
del self.engine
self.engine = engine_poprt.PopRT(self.popef_path, config)
else:
raise ValueError("engine_name must be POPRT")
return self.engine
def benchmark(self, dataloader):
report = {}
report["BS"] = self.batch_size
interact_info = self.configs.get("interact_info", {})
if self.packrunner:
report["BS"] = self.pack_bs
iterations = self.workload["iterations"]
qps, avg_latency, tail_latency = self.engine.benchmark_pack(
interact_info["pack_config"], iterations
)
else:
iterations = self.workload["iterations"]
clients = interact_info.get("clients", 1)
qps, avg_latency, tail_latency = self.engine.benchmark(
clients, self.batch_size, iterations
)
report["QPS"] = int(qps)
report["AVG Latency"] = avg_latency
report["P99 Latency"] = tail_latency
return report
def get_loaded_batch_size(self):
# return self.workload['batch_sizes'][0]
return self.batch_size
def load(self, batch_size) -> None:
# synchronize configuration updates from compile backend
self.update_packrunner_info()
self.precision = (
self.configs.get("interact_info", {}).get("converter_options", {})
.get("precision", "FP32")
.upper()
)
if self.packrunner:
batch_size = self.pack_bs
self.popef_path = os.path.join(
self.compiled_dir,
self.configs["model"],
str(batch_size),
"executable_{}.popef".format(self.precision),
)
self._get_engine(batch_size)
def update_packrunner_info(self):
interact_info = self.configs.get("interact_info", {})
is_pack = interact_info.get("pack_config", False)
if not is_pack:
return
pack_config = interact_info["pack_config"]
if is_pack:
self.packrunner = True
self.pack_bs = pack_config["batch_size"]
def _input_adjustment(self, inputs):
        # packing mode requires "position_ids" for bert-like models
if self.packrunner:
seq_len = np.count_nonzero(inputs[self.pack_config["mask_name"]])
if self.configs["model"] == "roberta-torch-fp32":
inputs["position_ids"] = np.arange(seq_len, dtype=np.int32) + 1
elif self.configs["model"] in ("albert-torch-fp32", "bert-torch-fp32"):
inputs["position_ids"] = np.arange(seq_len, dtype=np.int32)
elif self.configs["model"] == "deberta-torch-fp32":
inputs["unpack_info"] = np.zeros(1, dtype=np.int32)
inputs.pop("token_type_ids")
# Sparse Processing Unit (SPU)
Moffett AI, the inventor of the dual-sparsity algorithm, holds world-leading sparse computing technology, with more
than 30 patents worldwide. The company builds a new generation of AI computing platform through hardware/software
co-design, achieving order-of-magnitude gains in computing performance while reducing latency and total cost of
ownership (TCO).
The results in Byte MLPerf demonstrate the potential of sparse computing for inference performance and energy
efficiency, which leads to a lower TCO.
For Byte MLPerf, Moffett has submitted performance results of the following models.
| Model | Precision | Sparsity* | QPS | Dataset | Metric name | Metric value |
|----------------------|-----------|----------|------|----------------|-------------|--------------|
| resnet50-torch-fp32 | INT8 | 16x | 52423 | Open Imagenet | Top-1 | 76.61% |
| bert-torch-fp32 | INT8/BF16 | 16x | 7738 | Open Squad 1.1 | F1 Score | 86.09 |
| albert-torch-fp32 | INT8/BF16 | 16x | 10824 | Open Squad 1.1 | F1 Score | 87.66 |
| roberta-torch-fp32 | INT8/BF16 | 16x | 8107 | Open Squad 1.1 | F1 Score | 86.63 |
| conformer-encoder-onnx-fp32 | INT8/BF16 | 8x | 8211 | Fake Dataset | Mean Diff | 1.50 |
\* The sparsity figure is determined by the ratio of time spent on Matmul operations to the overall model inference time.
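To make the footnote concrete, the sketch below is an illustration only; the Matmul time fraction used here is a hypothetical placeholder, not a measured value. It shows how the end-to-end gain relates to the share of inference time spent in Matmul, assuming an Amdahl-style composition:

```python
# Illustrative only: end-to-end speedup when only Matmul ops are accelerated by sparsity.
# matmul_fraction is a hypothetical share of total inference time spent in Matmul.
def effective_speedup(matmul_fraction: float, matmul_speedup: float) -> float:
    return 1.0 / ((1.0 - matmul_fraction) + matmul_fraction / matmul_speedup)

# Example: if 90% of the time were spent in Matmul and Matmul ran 16x faster,
# the whole model would run roughly 6.4x faster end to end.
print(round(effective_speedup(0.9, 16.0), 1))
```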
Besides the performance results, energy efficiency is another significant highlight of Moffett's devices. For example,
the peak power consumption of the S30 is merely 250 W.
The Antoum architecture, built through hardware/software co-design together with Moffett's original sparsity
algorithm, is what delivers this performance with high energy efficiency.
The accelerators for AI inference applications in data centers are equipped with Moffett's first-generation Antoum
processor, the first commercial AI processor in the world with 32x sparsity.
Besides the sparse processing units (SPU) for native sparse convolution and matrix computing, Antoum also integrates a
Vector Processing Unit (VPU), which enables flexible programmability to keep up with the fast evolution of AI models.
In addition, the on-chip video codec, which supports 192-way 1080p video decoding at 30 FPS, and the JPEG decoder,
which supports 1080p image decoding at up to 6960 FPS, provide end-to-end capability for video and image inference
workloads.
Moffett provides three SKUs of sparse computing devices: S4, S10, and S30. The S10 and S30 are designed to deliver 2x
and 3x the computing power of the S4, respectively.
## How to run
### 1. Environmental preparation
#### Download offline image
```bash
wget moffett-oss-bucket01.oss-cn-shenzhen.aliyuncs.com/byte-perf/byte-perf-2.3.2-20230721.tar
```
#### Load offline image
```bash
docker load -i byte-perf-2.3.2-20230721.tar
```
#### Download and decompress the model data package
```bash
wget moffett-oss-bucket01.oss-cn-shenzhen.aliyuncs.com/byte-perf/byte-perf-data.tar.gz
tar -zxvf byte-perf-data.tar.gz
```
### 2. Create docker container
Note: `--shm-size` should be set to about 95% of the host's total memory (for example, `--shm-size="300g"`); see the helper sketch after the command below.
```bash
cd byte-perf-data
sudo docker run -itd \
--privileged \
--cap-add=ALL \
--net=host \
-v /dev:/dev \
-v /usr/src:/usr/src \
-v /lib/modules:/lib/modules \
-v $PWD/package:/home/moffett/workspace/package \
-e ROOT_PASS=moffett \
--shm-size="300g" \
--name byte-perf-2023 \
byte-perf:2.3.2-20230721
```
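If you are unsure what value to pass for `--shm-size`, the short sketch below (an illustrative helper, not part of the Moffett package) prints roughly 95% of the host's physical memory in gigabytes on Linux:

```python
# Illustrative helper: print ~95% of total host memory for use with --shm-size.
import os

page_size = os.sysconf("SC_PAGE_SIZE")      # bytes per memory page
phys_pages = os.sysconf("SC_PHYS_PAGES")    # number of physical pages
total_gb = page_size * phys_pages / 1024 ** 3
print(f'--shm-size="{int(total_gb * 0.95)}g"')
```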
### 3. Environment initialization
Perform environment initialization inside the container.
```bash
docker exec -it byte-perf-2023 /bin/bash
```
#### Install drivers and load firmware
```bash
cd /usr/local/sola/driver/bin/
sudo ./setup.sh
```
#### Device basic information verification
mf-smi is a command-line utility that displays various information about the S30, such as the card index, utilization, temperature, and power consumption.
After the driver is installed successfully, run mf-smi to view basic information about the device.
```bash
mf-smi
```
### 4. Run byte-mlperf task in container
```bash
cd /home/moffett/workspace/package/bytemlperf
# config spu-backend env
export PYTHONPATH=$PYTHONPATH:/home/moffett/workspace/spu-backend-release/ubuntu18.04-gcc7.5.0-x86_64/lib/
# conformer
python3 launch.py --task conformer-encoder-onnx-fp32 --hardware_type SPU
# albert
python3 launch.py --task albert-torch-fp32 --hardware_type SPU
# bert
python3 launch.py --task bert-torch-fp32 --hardware_type SPU
# roberta
python3 launch.py --task roberta-torch-fp32 --hardware_type SPU
# resnet50
python3 launch.py --task resnet50-torch-fp32 --hardware_type SPU
```
## Contact us
If you are interested in further information about the products, please contact the email: sales@moffett.ai
from torch.utils.data import DataLoader as DataLoaderX
from dataset.dataset import ImageNetDataset, MZJBertDataset, DummyDataset
from nn_compiler.common.constants import OpType
from common_compile import SparsertBaseBuilder
import onnx
def get_onnx_input_info(onnx_model_path):
# Load ONNX model
model = onnx.load(onnx_model_path)
# Initialize an empty dictionary to store input names and shapes
input_info = {}
# Iterate through the inputs of the model
for input in model.graph.input:
input_name = input.name
input_shape = [dim.dim_value for dim in input.type.tensor_type.shape.dim]
input_info[input_name] = input_shape
return input_info
def get_model_input_info(onnx_input_info,batch_size):
config_input_dict = {}
input_shape_dict = {}
for input_name,input_shape in onnx_input_info.items():
config_input_dict[input_name] = input_name
input_shape[0] = batch_size
input_shape_dict[input_name] = input_shape
return config_input_dict,input_shape_dict
class Resnet50Builder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(Resnet50Builder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# calibration dataset config
dataset = ImageNetDataset(self.dataset_dir, transform_file=self.dataset_cfg)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size)
self.config.calib_batch = 1
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# you can also set other configs here
self.config.do_kl = True
self.config.opt_level = 8
self.config.total_cores = 1
class BertBaseBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(BertBaseBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class AlbertBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(AlbertBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class RobertaBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(RobertaBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class ConformerBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(ConformerBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = DummyDataset(input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class GeneralBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(GeneralBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = DummyDataset(input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import logging
from typing import Any, Dict, List, Optional
from general_perf.backends import compile_backend
from base_compile import Resnet50Builder, BertBaseBuilder, AlbertBuilder, RobertaBuilder, ConformerBuilder, GeneralBuilder
log = logging.getLogger("CompileBackendSPU")
class CompileBackendSPU(compile_backend.CompileBackend):
def __init__(self):
super(CompileBackendSPU, self).__init__()
self.hardware_type = "SPU"
self.batch_size = None
self.model_name = ""
self.configs = None
self.workload = None
self.model_info = None
self.model = None
self.interact_info = None
def compile(self,
configs: Dict[str, Any],
dataloader=None) -> Dict[str, Any]:
"""
Model compilation interface. Model conversion and compilation
can be performed here. The model format can be changed here.
"""
model_info = configs["model_info"]
        name = model_info['model']
builder_dict = {
"resnet50-torch-fp32": Resnet50Builder,
"bert-torch-fp32": BertBaseBuilder,
"albert-torch-fp32": AlbertBuilder,
"roberta-torch-fp32": RobertaBuilder,
"conformer-encoder-onnx-fp32": ConformerBuilder
}
if name in builder_dict:
SparserBuilder = builder_dict[name]
else:
SparserBuilder = GeneralBuilder
interact_info = self.get_interact_profile(configs)
onnx_path = interact_info["onnx_path"]
        dump_dir = os.path.dirname(os.path.abspath(interact_info["model_path"]))
dataset_dir = interact_info["calibration_dir"]
dataset_cfg = interact_info["transform_file"]
model_precision = interact_info["model_precision"]
batch_size = interact_info["batch_size"]
verify = interact_info["verify"]
builder = SparserBuilder(onnx_path, dump_dir, dataset_dir, dataset_cfg, model_precision, batch_size, verify)
compile_info = builder()
result = {
"model": configs["model_info"]["model"],
"framework": configs["model_info"]["framework"],
"compile_precision": model_precision,
"input_type": configs["model_info"]["input_type"].split(","),
"max_batch_size": configs["workload"]["batch_sizes"][-1],
"compile_status":"success",
"sg_percent": 100,
"sparsity_ratio":compile_info["sparsity_ratio"],
"segments": [
{
"sg_idx": 0,
"is_fallback": False,
"input_tensor_map": configs["model_info"]["input_shape"],
"output_tensor_map": configs["model_info"]["outputs"],
"compiled_model": [
{
"compiled_bs": batch_size,
"compiled_obj": dump_dir,
},
],
},
],
"interact_info": interact_info,
}
return result
def get_interact_profile(self, configs: Dict[str, Any]):
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f"interact_info/{configs['model_info']['model']}.json")
if os.path.exists(file_path):
with open(file_path, 'r') as f:
model_profile = json.load(f)
return model_profile
else:
log.info('File path: {} does not exist, please check'.format(file_path))
raise NotImplementedError("CompileBackend:get_interact_profile")
def get_best_batch_size(self) -> Optional[List[int]]:
"""
Get Best Batch Size for the model
"""
return [1]
{
"model": "albert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/albert.onnx",
"model_path": "general_perf/download/moffett/converted_models/albert-mf-int8/albert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/albert-base-v2_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "bert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/bert-base.onnx",
"model_path": "general_perf/download/moffett/converted_models/bert-mf-int8/bert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/bert-base-uncased_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "conformer-encoder-onnx-fp32",
"onnx_path": "general_perf/download/moffett/models/conformer.onnx",
"model_path": "general_perf/download/moffett/converted_models/conformer-mf-int8/conformer-mf-int8.zip",
"calibration_dir": " ",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "none",
"framework": "sparsert"
}
{
"model": "resnet50-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/resnet50.onnx",
"model_path": "general_perf/download/moffett/converted_models/resnet50-mf-int8/resnet50-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/imgnet_calibrate_data",
"transform_file": "general_perf/download/moffett/compiler_wrapper/resnet50-mf-int8/mxnet_imagenet_trans_224.json",
"batch_size": 4,
"verify": false,
"model_precision": "INT8",
"dataset_name": "open_imagenet",
"framework": "sparsert"
}
{
"model": "roberta-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/roberta.onnx",
"model_path": "general_perf/download/moffett/converted_models/roberta-mf-int8/roberta-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/roberta-base_squad.npy",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
matplotlib==3.4.2
mypy-protobuf==2.8
protobuf==3.19.6
numpy==1.23.4
pandas==1.5.1
torch==1.9.1
tensorflow==2.8.4
bert-tensorflow==1.0.1
torchvision==0.10.1
sentencepiece==0.1.96
tokenization
tensorflow-datasets==4.7.0
google-cloud-core==2.3.2
import logging
import tensorflow as tf
import torch
import numpy as np
import time
import yaml
import multiprocessing
from multiprocessing import Manager
from general_perf.backends import runtime_backend
from inference import ModelFactory
from threading import Thread
hardware_type = "spu".upper()
tf.get_logger().setLevel('ERROR')
log = logging.getLogger(f"Backend-{hardware_type}")
bfloat16 = tf.bfloat16.as_numpy_dtype
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
tf_dtype_map = {
"FLOAT32": tf.float32,
"FLOAT16": tf.float16,
"INT32": tf.int32,
}
INPUT_TYPE = {
"UINT8": np.uint8,
"INT8": np.int8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class RuntimeBackendSPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendSPU, self).__init__()
self.hardware_type = hardware_type
self.batch_size = None
self.input_rank = []
self.model_name = ""
self.current_batch_size = 4
self.dry_run = False
self.output_dtype = None
self.output_shape = None
self.configs = None
self.workload = None
self.model_info = None
self.model = None
self.order = None
self.yaml_config = None
self.need_reload = True
self.all_resnet50_start_time_list = []
self.all_resnet50_end_time_list = []
def predict(self, feeds):
input_name_list = self.configs['input_name']
if not self.model:
log.info("no model_runtime...")
self.load(self.get_loaded_batch_size())
if self.model_name == "resnet50-torch-fp32":
request = [feeds[name] for name in input_name_list]
response = self.model.inference(request)
return response
elif self.model_name == "conformer-encoder-onnx-fp32":
request = [feeds[name] for name in input_name_list]
response = self.model.inference(request)
return response
elif self.model_name in ["bert-torch-fp32", "albert-torch-fp32", "roberta-torch-fp32"]:
request, model_info = self.model.preprocess(feeds, self.yaml_config)
response = self.model.inference(request)
response = self.model.postprocess(response, model_info)
return response
else:
raise NotImplementedError(f"task: {self.model_name} not supported")
def callback_func(self):
end_time = time.time() * 1000
self.all_resnet50_end_time_list.append(end_time)
def benchmark(self, dataloader):
batch_sizes = self.workload['batch_sizes']
reports = []
iterations = self.workload['iterations']
for idx, batch_size in enumerate(batch_sizes):
if batch_size != self.batch_size:
continue
self.yaml_config.update(
{"min_batch_size": self.yaml_config['chunk_size']})
report = {}
qps = None
dataloader.rebatch(batch_size)
input_name_list = self.configs['input_name']
if self.model_name == "resnet50-torch-fp32":
test_data, _ = dataloader.get_samples(0)
all_resnet50_start_time_list = self.all_resnet50_start_time_list
all_resnet50_end_time_list = self.all_resnet50_end_time_list
self.model = ModelFactory(self.model_info)
model = self.model
model.load_model()
model.device_num = 3
request = [test_data[name] for name in input_name_list]
for _ in range(iterations):
resnet50_start_time = time.time()
all_resnet50_start_time_list.append(resnet50_start_time * 1000)
output_data = self.model.inference(request, self.callback_func)
start_time_list = all_resnet50_start_time_list
end_time_list = all_resnet50_end_time_list
elif self.model_name == "conformer-encoder-onnx-fp32":
test_data = dataloader.get_samples(0)
all_conformer_start_time_list = []
all_conformer_end_time_list = []
self.model = ModelFactory(self.model_info)
model = self.model
model.load_model()
model.device_num = 3
request = [test_data[name] for name in input_name_list]
start = time.time()
for _ in range(iterations):
conformer_start_time = time.time()
all_conformer_start_time_list.append(conformer_start_time * 1000)
output_data = self.model.inference(request)
conformer_end_time = time.time()
all_conformer_end_time_list.append(conformer_end_time * 1000)
start_time_list = all_conformer_start_time_list
end_time_list = all_conformer_end_time_list
elif self.model_name in ["bert-torch-fp32", "albert-torch-fp32", "roberta-torch-fp32"]:
test_data, _ = dataloader.get_samples(0)
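                # For bert-like models, benchmarking builds a 4-stage pipeline
                # (input -> preprocess -> inference -> postprocess), with the stages
                # connected by queues so that preprocessing, NPU inference, and
                # postprocessing overlap. Start timestamps are recorded when a batch
                # enters the pipeline and end timestamps when its result is consumed.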
def input_worker(_input_queue, data, iteration, shared_list):
for i in range(iteration):
batch_start_time = time.time()
shared_list.append(batch_start_time)
_input_queue.put(data)
_input_queue.put(None)
return
def preprocessing_worker(_input_queue, _preprocess_queue, _info_queue, model_info):
while True:
data = _input_queue.get()
if data is None:
_info_queue.put(None)
_preprocess_queue.put(None)
return
input_data_list, info = self.model.preprocess(data, model_info)
_preprocess_queue.put(input_data_list)
_info_queue.put(info)
def inference_worker(_preprocess_queue, _inference_queue, config):
self.model = ModelFactory(config)
model = self.model
model.load_model()
model.device_num = 3
while True:
data = _preprocess_queue.get()
if data is None:
_inference_queue.put(None)
model.destroy()
return
output_data = self.model.inference(data)
_inference_queue.put(output_data)
def postprocessing_worker(_inference_queue, _postprocess_queue, _info_queue):
while True:
data = _inference_queue.get()
info = _info_queue.get()
if data is None:
_postprocess_queue.put(None)
_info_queue.put(None)
return
_postprocess_queue.put(self.model.postprocess(data, info))
def consumer(_postprocess_queue, shared_end_list):
ans = []
while True:
i = _postprocess_queue.get()
if i is None:
return ans
ans.append(i)
batch_end_time = time.time()
shared_end_list.append(batch_end_time)
manager = Manager()
shared_start_list = manager.list()
shared_end_list = manager.list()
# Inference Pipeline
input_queue = multiprocessing.JoinableQueue()
preprocess_queue = multiprocessing.JoinableQueue()
info_queue = multiprocessing.JoinableQueue()
inference_queue = multiprocessing.JoinableQueue()
postprocess_queue = multiprocessing.JoinableQueue()
                # [0] process that feeds input data
input_process = multiprocessing.Process(
target=input_worker, args=(input_queue, test_data, iterations, shared_start_list))
                # [1] model preprocessing process
preprocessing_process = multiprocessing.Process(
target=preprocessing_worker, args=(input_queue, preprocess_queue, info_queue, self.yaml_config))
                # [2] model inference worker (runs in a thread)
inference_process = Thread(
target=inference_worker, args=(preprocess_queue, inference_queue, self.yaml_config))
                # [3] model postprocessing process
postprocessing_process = multiprocessing.Process(
target=postprocessing_worker, args=(inference_queue, postprocess_queue, info_queue))
                # start timing
input_process.start()
preprocessing_process.start()
inference_process.start()
postprocessing_process.start()
processes = [input_process, preprocessing_process, inference_process, postprocessing_process]
responses = consumer(postprocess_queue, shared_end_list)
for p in processes:
p.join()
start_time_list = list(shared_start_list)
end_time_list = list(shared_end_list)
else:
raise NotImplementedError(f"task: {self.model_name} not supported")
            # stop timing
all_latency = [(x - y) * 1000 if self.model_name != "resnet50-torch-fp32" else x - y for x, y in zip(end_time_list, start_time_list)]
all_latency.sort()
index = int(len(all_latency) * 0.99)
tail_latency = all_latency[index] / 1000
avg_latency = sum(all_latency) / len(all_latency) / iterations
if not qps:
qps = round(1000 * batch_size / avg_latency, 2)
tail_latency = round(tail_latency, 2)
avg_latency = round(avg_latency, 2)
qps = round(qps, 2)
log.info(
"\033[32m" + f"Report: Batch Size is {batch_size}, QPS is {qps}, AVG Latency is {avg_latency} ms, P99 Latency is {tail_latency} ms" + "\033[0m")
report['BS'] = batch_size
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
print(f"AVG Latency:{avg_latency}, P99 Latency:{tail_latency}")
reports.append(report)
return reports
def get_loaded_batch_size(self):
# only used in accuracy mode, not in benchmark.
name = self.configs['model']
self.yaml_config.update(
{"min_batch_size": self.yaml_config['chunk_size']})
if "bert-torch-fp32" in name or "albert-torch-fp32" in name:
return 12
elif "roberta-torch-fp32" in name:
return 4
elif "resnet50-torch-fp32" in name:
return 16
elif "conformer-encoder-onnx-fp32" in name:
return 16
else:
raise NotImplementedError(f"task : {name} not supported")
def load(self, batch_size):
self.batch_size = batch_size
self.model_name = self.configs['model']
self.model_info.update({"input_name": self.model_info['inputs'].split(",")})
task_name = self.model_info["model"]
self.yaml_config = yaml.safe_load(open(f"./general_perf/download/moffett/converted_models/{task_name}.yaml", "r"))
self.yaml_config.update({
"model": self.configs["model"],
"input_name": self.model_info["input_name"],
"dataset_name": self.model_info['dataset_name']
})
if 'input_order' in self.yaml_config["model_input"][0]:
self.yaml_config.update(
{"input_order": [inp['input_order'] for inp in self.yaml_config["model_input"]]})
if self.need_reload:
self.model = ModelFactory(self.model_info)
self.model.load_model()
self.model.device_num = 1
self.need_reload = False
else:
log.info("model has been loaded, skip load process")
<div align="center">
<img src="STC.jpg">
</div>
# Supported model inference results
| Model name | QPS | Dataset | Metric name | Metric value |
| :-----:| :----: | :----: | :----: | :----: |
| albert-torch-fp32 | 824.49 | Open Squad 1.1 | F1 Score | 87.66 |
| bert-tf-fp32 | 822.38 | Open Squad 1.1 | F1 Score | 86.45 |
| bert-torch-fp32 | 813.86 | Open Squad 1.1 | F1 Score | 86.14 |
| resnet50-tf-fp32 | 8725.94 | Open ImageNet | Top-1 | 77.24% |
| robert-torch-fp32 | 800.7 | Open Squad 1.1 | F1 Score | 83.19 |
| widedeep-tf-fp32 | 2395899.9 | Open Criteo Kaggle | Top-1 | 77.39% |
For more detailed result information, see general_perf/reports/STC/. The models above are deployed on an NPU (Neural-network Processing Unit) card, the STCP920, which is designed and manufactured by Beijing Stream Computing Technology Co., LTD. Software associated with the STCP920 is as follows:
| Software | Version | Description |
| :-----:| :----: | :----: |
| HPE | 1.5.1 | Heterogeneous Programming Environment |
| TensorTurbo | 1.11.0 | An AI compiler for STCP920 developed based on TVM |
| STC_DDK | 1.1.0 | Deployment Development Kit for the STCP920, which includes the AI Convertor, AI Executor, and utilities used in model conversion. |
In addition, a variety of tools are provided for monitoring the status of NPU devices, debugging heterogeneous programs, and analyzing the accuracy and performance of NPU programs.
| Software | Description |
| :-----:| :----: |
| stc-smi | Stream Computing System Management Interface for managing and monitoring NPU devices, including viewing device information and resource usage |
| stc-gdb | Stream Computing Debugger for debugging heterogeneous NPU programs |
| stc-prof | Stream Computing Profiler, for performance analysis and optimization of heterogeneous programs |
| stc-hpaa | Stream Computing Half-Precision Accuracy Analysis, for locating calculation errors and the corresponding data |
For more detailed software information, please refer to: https://docs.streamcomputing.com/_/sharing/vSxLMI20nalGphdpXdEVoDg6JkUcfEkT?next=/zh/latest/
# How to run
1. Prepare environment
Prepare a machine with an STCP920 card, install HPE, and install the packages in general_perf/requirements.txt. Then create a virtual environment, install the packages in general_perf/backends/STC/requirements.txt, and install TensorTurbo and STC_DDK. The installation packages can be obtained from this link: https://docs.streamcomputing.com/_/sharing/vSxLMI20nalGphdpXdEVoDg6JkUcfEkT?next=/zh/latest/
```bash
export PYTHONPATH=$PYTHONPATH:ByteMLPerf:ByteMLPerf/general_perf/backends/STC
```
2. Prepare model and dataset
Run general_perf/prepare_model_and_dataset.sh to get model and dataset.
3. Run
```bash
python3 launch.py --task xxx --hardware_type STC
```
The --task parameter specifies the workload to run and must be provided. For example, to evaluate the workload bert-tf-fp16.json, specify --task bert-tf-fp16.
# Company introduction
Beijing Stream Computing Technology Co., LTD is committed to providing cloud service providers with cost-effective and highly versatile AI accelerator chips.
The first-generation chip achieves 128 TFLOPS of half-precision floating-point performance, twice that of the T4. At present, the first-generation NPU card, the STCP920, is in mass production, and a batch of units has already been shipped to users. The second-generation products are on schedule and will arrive in 2023.
# The technical specifications of the first-generation chip
| Name | Value |
| :-----:| :----: |
| AI Computation power | 128 TFLOPS @ FP16 |
| Memory Type | LPDDR4X |
| Memory | 16GB, 119.4GB/S |
| Last Level Buffer | 8MB, 256GB/s |
| Level 1 Buffer | 1.25MB, 512GB/s |
| Host Interface | PCIe 4.0 x16, 32GB/s, supports Lane Reversal |
| Thermal Design Power | 160W |
| Structural Dimension | 268.44mm x 111.15mm, single slot |
# What we have done
We provide development kits that support converting any deep learning model into an STC engine and deploying it on a CPU+NPU server.
An AI compiler (TensorTurbo) converts selected parts of a deep learning model into an NPU-executable file. The compiler applies a series of transformations and optimizations during model conversion to ensure better inference performance of the result.
Using the associated software, we have supported over 150 open-source models from four deep learning frameworks: TensorFlow 1.x and 2.x, PyTorch, ONNX, and PaddlePaddle. The application fields include CV, NLP, recommendation, speech, OCR, and multimodal. Most of the models achieve 2x the inference performance of the NVIDIA T4 GPU.
# Contact us
If you are interested in further information about the product, please contact the email: johnson@streamcomputing.com
{
"albert-torch-fp32": {
"best_batch": 4
},
"bert-tf-fp32": {
"best_batch": 2
},
"bert-torch-fp32": {
"best_batch": 2
},
"resnet50-tf-fp32": {
"best_batch": 16
},
"robert-torch-fp32": {
"best_batch": 8
},
"widedeep-tf-fp32": {
"best_batch": 2048
}
}
tensorflow==1.15.0
protobuf==3.19.4
decorator
graphviz==0.8.4
scipy
attrs==21.2.0
pyyaml
synr==0.4
multipledispatch
pytest
matplotlib==3.3.4
pulp
scikit-learn
torchvision==0.13.1
torch==1.12.1
onnx==1.12.0
onnxconverter-common==1.12.2
onnxruntime==1.12.0
pybind11>=2.9.1
bert-tensorflow==1.0.1
tqdm
psutil
pydot
sentencepiece==0.1.96
virtualenv==16.7.9
keras2onnx
pybind11
setuptools==67.5