Commit 24b257f1 authored by sunzhq2

init

parent 920b3c0f
{
"model": "albert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/albert.onnx",
"model_path": "general_perf/download/moffett/converted_models/albert-mf-int8/albert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/albert-base-v2_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "bert-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/bert-base.onnx",
"model_path": "general_perf/download/moffett/converted_models/bert-mf-int8/bert-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/bert-base-uncased_squad.npy",
"transform_file": "",
"batch_size": 12,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
{
"model": "conformer-encoder-onnx-fp32",
"onnx_path": "general_perf/download/moffett/models/conformer.onnx",
"model_path": "general_perf/download/moffett/converted_models/conformer-mf-int8/conformer-mf-int8.zip",
"calibration_dir": " ",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "none",
"framework": "sparsert"
}
{
"model": "resnet50-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/resnet50.onnx",
"model_path": "general_perf/download/moffett/converted_models/resnet50-mf-int8/resnet50-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/imgnet_calibrate_data",
"transform_file": "general_perf/download/moffett/compiler_wrapper/resnet50-mf-int8/mxnet_imagenet_trans_224.json",
"batch_size": 4,
"verify": false,
"model_precision": "INT8",
"dataset_name": "open_imagenet",
"framework": "sparsert"
}
{
"model": "roberta-torch-fp32",
"onnx_path": "general_perf/download/moffett/models/roberta.onnx",
"model_path": "general_perf/download/moffett/converted_models/roberta-mf-int8/roberta-mf-int8.zip",
"calibration_dir": "general_perf/download/moffett/datasets/roberta-base_squad.npy",
"transform_file": "",
"batch_size": 4,
"verify": false,
"model_precision": "MixInt8bf16",
"dataset_name": "open_squad",
"framework": "sparsert"
}
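The five entries above share one schema: model, onnx_path, model_path, calibration_dir, transform_file, batch_size, verify, model_precision, dataset_name, and framework. Below is a minimal sketch of loading and sanity-checking such a config before handing it to the compiler; the `load_moffett_config` helper and its key checks are illustrative assumptions, not part of this commit.

```python
# Sketch only: illustrative loader for the per-model JSON configs above.
import json

REQUIRED_KEYS = {
    "model", "onnx_path", "model_path", "calibration_dir",
    "transform_file", "batch_size", "verify", "model_precision",
    "dataset_name", "framework",
}

def load_moffett_config(path):
    """Load one per-model JSON config and check that the expected keys are present."""
    with open(path, "r") as f:
        cfg = json.load(f)
    missing = REQUIRED_KEYS - cfg.keys()
    if missing:
        raise ValueError(f"{path} is missing keys: {sorted(missing)}")
    if cfg["framework"] != "sparsert":
        raise ValueError(f"unexpected framework: {cfg['framework']}")
    return cfg

# Example: cfg = load_moffett_config("albert-torch-fp32.json"); print(cfg["batch_size"])
```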
import os
import json
import logging
import torch._tensor
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import migraphx
import tensorflow as tf
import torch
import onnxruntime
import time
import numpy as np
import onnx
from onnx import shape_inference
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendDCU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"INT8": np.int8,
"UINT8": np.uint8,
"FLOAT32": np.float32,
"FLOAT16": np.float16,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendDCU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendDCU, self).__init__()
self.hardware_type = 'DCU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Tensorflow":
entry_rt = self.model_runtimes[0].signatures['serving_default']
all_sn_inputs = entry_rt.structured_input_signature
def get_real_feeds(feeds, sn_inputs):
sn_inputs = tf.nest.flatten(sn_inputs, True)
real_feeds = {}
itr = 0
for _, val in feeds.items():
real_feeds[sn_inputs[itr].name] = tf.constant(val)
itr += 1
return real_feeds
real_feeds = get_real_feeds(feeds, all_sn_inputs)
start_time = time.time()
for model_runtime in self.model_runtimes:
with tf.device('GPU'):
_results = model_runtime.signatures['serving_default'](
**real_feeds)
end_time = time.time()
use_time = end_time - start_time
results = {}
for key, val in _results.items():
results[key] = val.numpy()
assert len(results) != 0
elif self.framework == "Pytorch":
input_tensors = []
new_input_type = self.input_type.split(',')
i = 0
for key, _ in feeds.items():
input_tensors.append(
torch.tensor(feeds[key],
dtype=pt_dtype_map[new_input_type[i]]).to(
self.device))
i += 1
start_time = time.time()
if self.configs['compile_precision'] == "FP16" and self.configs['model'].find("bert") != -1:
with torch.no_grad(), torch.cuda.amp.autocast():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
else:
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
end_time = time.time()
use_time = end_time - start_time
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.cpu().detach().numpy()
elif isinstance(results, tuple):
dic = {}
for i, key in enumerate(self.outputs):
dic[key] = results[i].cpu().detach().numpy()
results = dic
else:
results = {self.outputs[0]: results.cpu().numpy()}
elif self.framework == "Migraphx":
for model_runtime in self.model_runtimes:
modelData = self.AllocateOutputMemory(model_runtime)
for key, _ in feeds.items():
feeds[key] = np.array(feeds[key])
modelData[key] = migraphx.to_gpu(migraphx.argument(feeds[key]))
start_time = time.time()
results_migraphx = model_runtime.run(modelData)
end_time = time.time()
use_time = end_time - start_time
results = []
for i in range(len(results_migraphx)):
result = np.array(results_migraphx[i])
results.append(result)
else:
for model_runtime in self.model_runtimes:
start_time = time.time()
results = model_runtime.run(None, feeds)
end_time = time.time()
use_time = end_time - start_time
return use_time, results
# return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
time_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
for _ in range(30):
self.predict(test_data)
for _ in range(iterations):
start_time = time.time()
use_time,_ = self.predict(test_data)
end_time = time.time()
times_range.append(use_time)
time_range.append(batch_size / use_time)
# times_range.append(end_time - start_time)
times_range.sort()
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
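# avg_latency is in milliseconds, so QPS = 1000 * batch_size / avg_latency;
# e.g. batch_size = 12 with a 6 ms average latency gives int(1000.0 * 12 / 6) = 2000 QPS.
# tail_latency is the per-iteration time at the 99th-percentile index of the sorted list.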
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
for i, segment in enumerate(self.configs['segments']):
# there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Tensorflow":
'''
Determine the required model precision and apply the corresponding conversion.
'''
if self.configs['compile_precision'] == "FP16":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
for var in model.variables:
var.assign(tf.cast(var,tf.float16))
if self.configs['compile_precision'] == "INT8":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
for var in model.variables:
var.assign(tf.cast(var,tf.int8))
if self.configs['compile_precision'] == "FP32":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
elif self.framework == "Pytorch":
self.device = "cuda"
'''
Check whether the model is a BERT variant; if so, the torch.jit fuser must be disabled.
'''
if self.configs['model'].find("bert") != -1:
torch._C._jit_set_texpr_fuser_enabled(False)
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj'],
torch.device('cuda'))
if self.configs['compile_precision'] == "FP16":
if self.configs['model'].find("bert") != -1:
scaler = torch.cuda.amp.GradScaler()
model = model.half()
model.eval()
elif self.framework == "Migraphx":
self.device = "cuda"
if self.configs['model'] == 'bert-migraphx-fp16':
model = migraphx.load(segment['compiled_model'][0]['compiled_obj'] + f'-{self.batch_size}.mrx')
else:
model = migraphx.parse_onnx(segment['compiled_model'][0]['compiled_obj'] + f'-{self.batch_size}.onnx')
if self.configs['compile_precision'] == "INT8":
print("=======================INT8====================")
dic = dict()
fake_data = self._get_fake_samples(batch_size, self.configs['segments'][0]['input_tensor_map'], self.configs['input_type'])
for key,_ in fake_data.items():
dic[key] = migraphx.argument(fake_data[key])
calibration = [dic]
migraphx.quantize_int8(model, migraphx.get_target("gpu"), calibration)
model.compile(migraphx.get_target("gpu"),offload_copy=False,device_id=0)
else:
enable_tag = 'false'
if self.configs['compile_precision'] == 'FP16':
enable_tag = 'true'
if self.configs['model'].find("resnet50") != -1:
if self.configs['compile_precision'] == 'INT8':
providers = ['ROCMExecutionProvider']
else:
providers = ['MIGraphXExecutionProvider']
# provider_options=[{'device_id': '0','migraphx_fp16_enable':enable_tag,'dynamic_model':'true','migraphx_profile_max_shapes':'input_1.1:256x3x224x224'}]
provider_options=[{'device_id': '0'}]
else:
providers=['ROCMExecutionProvider']
# provider_options=None
provider_options=[{'device_id': '0'}]
# model = onnxruntime.InferenceSession(
# segment['compiled_model'][0]['compiled_obj'],
# providers=providers,provider_options=provider_options)
model = onnxruntime.InferenceSession(
segment['compiled_model'][0]['compiled_obj'] + f'-{self.batch_size}.onnx',
providers=providers,provider_options=provider_options)
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
new_input_type = self.input_type.split(',')
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
def AllocateOutputMemory(self, model):
outputData={}
for key in model.get_outputs().keys():
outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
return outputData
def GetMIGraphXType(self, type):
typeMap = {
'double_type': np.float64,
'float_type': np.float32,
'half_type': np.half,
'int64_type': np.int64,
'uint64_type': np.uint64,
'int32_type': np.int32,
'uint32_type': np.uint32,
'int16_type': np.int16,
'uint16_type': np.uint16,
'int8_type': np.int8,
'uint8_type': np.uint8,
'bool_type': bool
}
return typeMap[type]
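A hedged sketch of how this runtime backend could be exercised end to end; in ByteMLPerf the configs and workload dicts come from the compile step, so every value below is an illustrative placeholder rather than something taken from this commit.

```python
# Illustrative driver only: the harness normally fills configs/workload from the compile backend.
backend = RuntimeBackendDCU()
backend.configs = {
    "model": "resnet50-torch-fp32",
    "framework": "Pytorch",
    "compile_precision": "FP32",
    "input_type": "FLOAT32",
    "segments": [{
        "input_tensor_map": {"input": [1, 3, 224, 224]},
        "output_tensor_map": "output",
        "compiled_model": [{"compiled_obj": "compiled_models/resnet50.pt"}],  # placeholder path
    }],
}
backend.workload = {"iterations": 100}
backend.load(batch_size=4)                    # picks the Pytorch branch, loads the TorchScript model
report = backend.benchmark(dataloader=None)   # {'BS': ..., 'QPS': ..., 'AVG Latency': ..., 'P99 Latency': ...}
```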
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import torch
import onnxruntime
import time
import numpy as np
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendDCU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"INT8": np.int8,
"UINT8": np.uint8,
"FLOAT32": np.float32,
"FLOAT16": np.float16,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendDCU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendDCU, self).__init__()
self.hardware_type = 'DCU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Tensorflow":
entry_rt = self.model_runtimes[0].signatures['serving_default']
all_sn_inputs = entry_rt.structured_input_signature
def get_real_feeds(feeds, sn_inputs):
sn_inputs = tf.nest.flatten(sn_inputs, True)
real_feeds = {}
itr = 0
for _, val in feeds.items():
real_feeds[sn_inputs[itr].name] = tf.constant(val)
itr += 1
return real_feeds
real_feeds = get_real_feeds(feeds, all_sn_inputs)
for model_runtime in self.model_runtimes:
with tf.device('GPU'):
_results = model_runtime.signatures['serving_default'](
**real_feeds)
results = {}
for key, val in _results.items():
results[key] = val.numpy()
assert len(results) != 0
elif self.framework == "Pytorch":
input_tensors = []
new_input_type = self.input_type.split(',')
i = 0
for key, _ in feeds.items():
input_tensors.append(
torch.tensor(feeds[key],
dtype=pt_dtype_map[new_input_type[i]]).to(
self.device))
i += 1
if self.configs["model"] == "bert-torch-fp16":
with torch.cuda.amp.autocast():
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
else:
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.cpu().detach().numpy()
elif isinstance(results, tuple):
dic = {}
for i, key in enumerate(self.outputs):
dic[key] = results[i].cpu().detach().numpy()
results = dic
else:
results = {self.outputs[0]: results.cpu().numpy()}
else:
for model_runtime in self.model_runtimes:
if self.configs["model"] == "resnet50-onnxruntime-fp16":
feeds["input_1.1"] = feeds["input_1.1"].astype("float16")
results = model_runtime.run(None, feeds)
return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
for _ in range(30):
self.predict(test_data)
for _ in range(iterations):
start_time = time.time()
self.predict(test_data)
end_time = time.time()
times_range.append(end_time - start_time)
times_range.sort()
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
for i, segment in enumerate(self.configs['segments']):
# there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Tensorflow":
with tf.device('GPU'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
if self.configs['compile_precision'] == "FP16":
# cast all variables to float16
for var in model.variables:
var.assign(tf.cast(var,tf.float16))
elif self.framework == "Pytorch":
self.device = "cuda"
if self.configs["model"].split("-")[0] == "bert" or self.configs["model"].split("-")[0] == "roberta":
# torch.jit.fuser('off')
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
# torch._C._jit_set_texpr_fuser_enabled(False)
# torch._C._jit_set_nvfuser_enabled(False)
# https://github.com/pytorch/pytorch/issues/62962
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj'],
torch.device('cuda'))
if self.configs['compile_precision'] == "FP16":
model = model.half()
model.eval()
else:
# import pdb
# pdb.set_trace()
providers = [
('ROCMExecutionProvider', {
'device_id': 0,
'arena_extend_strategy': 'kNextPowerOfTwo',
# 'cudnn_conv_algo_search': 'EXHAUSTIVE',
'do_copy_in_default_stream': True,
}),
]
# enable FP16
options = onnxruntime.SessionOptions()
# options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
# options.intra_op_num_threads = 1
# options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
# options.enable_cuda_graph = True # if your hardware supports CUDA Graph
# options.add_session_config_entry("session.set_denormal_as_zero", "1")
if self.configs['compile_precision'] == "FP16":
options.add_session_config_entry("session.enable_fp16", "1") # enable FP16
model = onnxruntime.InferenceSession(
segment['compiled_model'][0]['compiled_obj'],
providers=providers,
sess_options=options)
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
new_input_type = self.input_type.split(',')
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[new_input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
<div align="center">
<img src="habana-white_intel_logo.png">
</div>
<!-- @import "[TOC]" {cmd="toc" depthFrom=1 depthTo=6 orderedList=false} -->
<!-- code_chunk_output -->
- [Habana](#habana)
- [Product Specs](#product-specs)
- [Models supported](#models-supported)
- [How to run](#how-to-run)
- [1. Create docker container](#1-create-docker-container)
- [2. Environment initialization](#2-environment-initialization)
- [3. Device basic information verification](#3-device-basic-information-verification)
- [4. Run byte-mlperf task](#4-run-byte-mlperf-task)
<!-- /code_chunk_output -->
# Habana
As enterprises and organizations look to seize the growing advantages of AI, the time has never been better for AI compute that's faster yet efficient: efficient on cost, power, and your time and resources. That's why you'll want to give Habana Gaudi processors a try. The Gaudi acceleration platform was conceived and architected to address the training and inference demands of large-scale AI, providing enterprises and organizations with high-performance, high-efficiency deep learning compute.
## Product Specs
- Gaudi
With Habana's first-generation Gaudi deep learning processor, customers benefit from the most cost-effective, high-performance training and inference alternative to comparable GPUs. This is the deep learning architecture that enables AWS to deliver up to 40% better price/performance for training with its Gaudi-based DL1 instances, as compared to comparable Nvidia GPU-based instances. Gaudi's efficient architecture also enables Supermicro to offer customers an equally significant price/performance advantage over GPU-based servers with the Supermicro X12 Gaudi Training Server.
<div align="center">
<img src="gaudi.png">
</div>
- Gaudi2
Our Gaudi2 accelerator drives improved deep learning price/performance and operational efficiency for training and running state-of-the-art models, from the largest language and multi-modal models to more basic computer vision and NLP models. Designed for efficient scalability, whether in the cloud or in your data center, Gaudi2 gives the AI industry the choice it needs, now more than ever.
<div align="center">
<img src="gaudi2.png">
</div>
# Models supported
| Model name | Precision | QPS | Dataset | Metric name | Metric value | report |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| bert-torch-fp32 | BF16 | 1970 | Open Squad 1.1 | F1 Score | 85.8827 | [report](../../reports/HPU/bert-torch-fp32/) |
| albert-torch-fp32 | BF16 | 2030 | Open Squad 1.1 | F1 Score | 87.66915 | [report](../../reports/HPU/albert-torch-fp32/) |
| deberta-torch-fp32 | BF16 | 1970 | Open Squad 1.1 | F1 Score | 81.33603 | [report](../../reports/HPU/deberta-torch-fp32/) |
| resnet50-torch-fp32 | BF16 | 8279 | Open ImageNet | Top-1 | 0.7674 | [report](../../reports/HPU/resnet50-torch-fp32/) |
| swin-large-torch-fp32 | BF16 |341 | Open ImageNet | Top-1 | 0.855 | [report](../../reports/HPU/swin-large-torch-fp32/) |
# How to run
### 1. Create docker container
```bash
docker run -itd --name test --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
```
### 2. Environment initialization
Environment initialization should be performed inside the container.
```bash
docker exec -it test /bin/bash
```
### 3. Device basic information verification
hl-smi is a command-line utility that reports Gaudi device information such as card index, utilization, temperature, and power consumption.
After the driver is installed successfully, run hl-smi to check the basic device information.
```bash
hl-smi
```
### 4. Run byte-mlperf task
For example,
```bash
python launch.py --task bert-torch-fp32 --hardware_type HPU
```
For more information of the command to run the task, please refer to [ByteMLPerf](../../../README.md#usage).
add
addmm
bmm
dropout
gelu
iadd
linear
matmul
mm
softmax
embedding
cross_entropy
nll_loss
log_softmax
truediv
div
layer_norm
rsub
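This operator list appears to be the BF16 lowering list that the HPU compile backend below points LOWER_LIST at (general_perf/backends/HPU/bert/bf16.txt): under Habana's autocast, ops named in LOWER_LIST run in BF16 while ops in the companion FP32_LIST are kept in FP32. That reading is inferred from the `_update_model_env` hook further down, not stated in this commit.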
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import torch
import time
import numpy as np
from general_perf.backends import compile_backend
log = logging.getLogger("CompileBackendHPU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class CompileBackendHPU(compile_backend.CompileBackend):
def __init__(self):
super(CompileBackendHPU, self).__init__()
self.hardware_type = 'HPU'
self.need_reload = False
self.model_runtimes = []
def _update_model_env(self):
if self.model_info["model"] in ("bert-torch-fp32", "albert-torch-fp32"):
os.environ['LOWER_LIST'] ='general_perf/backends/HPU/bert/bf16.txt'
os.environ['FP32_LIST'] ='general_perf/backends/HPU/bert/fp32.txt'
def compile(self, config, dataloader=None):
result = {
"model": config['model_info']['model'],
"framework": config['model_info']['framework'],
"compile_precision": "BF16",
"optimizations":{},
"instance_count": 1,
"device_count": 1,
"input_type": config['model_info']['input_type'].split(","),
"max_batch_size": config['model_info']['max_batch_size'],
"compile_status": "success",
"sg_percent": 100,
"segments": [
{
"sg_idx":
0,
"is_fallback": False,
"input_tensor_map": config['model_info']['input_shape'],
"output_tensor_map": config['model_info']['outputs'],
"compiled_model": [
{
"compiled_bs": 1,
"compiled_obj": config['model_info']['model_path'],
},
],
},
]
}
self.configs = result
self.workload = config['workload']
self.model_info = config['model_info']
self._update_model_env()
return result
def get_interact_profile(self, config):
model_profile = []
file_path = "general_perf/backends/HPU/" + self.hardware_type + '.json'
if os.path.exists(file_path):
with open(file_path, 'r') as f:
model_profile = json.load(f)
else:
log.info(
'File path: {} does not exist, please check'.format(file_path))
return model_profile
def get_best_batch_size(self):
"""
Get Best Batch Size for the model
"""
return None
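A hedged sketch of how compile() might be driven; the config dict below only mirrors the fields compile() actually reads, and every value (input names, shapes, model path) is a placeholder rather than something taken from this commit.

```python
# Sketch only: 'config' mirrors the fields compile() reads; all values are placeholders.
backend = CompileBackendHPU()
config = {
    "workload": {"iterations": 100},
    "model_info": {
        "model": "bert-torch-fp32",
        "framework": "Pytorch",
        "input_type": "LONG,LONG,LONG",
        "max_batch_size": 64,
        "input_shape": {"input_ids.1": [1, 384], "attention_mask.1": [1, 384], "token_type_ids.1": [1, 384]},
        "outputs": "start_logits,end_logits",
        "model_path": "general_perf/model_zoo/bert-torch-fp32/bert.pt",  # placeholder path
    },
}
result = backend.compile(config)   # also exports LOWER_LIST / FP32_LIST for bert/albert
segment = result["segments"][0]    # later consumed by the runtime backend's load()
```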
transformers
tokenization
torchvision
numpy
tensorflow
bert
bert-tensorflow==1.0.1
sentencepiece
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import torch
import time
import numpy as np
from threading import Thread
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendHPU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendHPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendHPU, self).__init__()
self.hardware_type = 'HPU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Pytorch":
input_tensors = []
i = 0
for key, _ in feeds.items():
if self.input_type[i] == "FLOAT32":
datatype = torch.bfloat16
else:
datatype = pt_dtype_map[self.input_type[i]]
input_tensors.append(
torch.tensor(feeds[key],
dtype=datatype).to(
self.device,non_blocking=True))
i += 1
import habana_frameworks.torch.core as htcore
with torch.no_grad(), torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=True):
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
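# mark_step() flushes the lazily accumulated ops as a graph to the HPU; the explicit
# stream synchronize below waits for execution to finish before outputs are read back.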
htcore.mark_step()
htcore.hpu.default_stream().synchronize()
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.float().cpu().detach().numpy() if val.dtype==torch.bfloat16 else val.cpu().detach().numpy()
elif isinstance(results, tuple):
dic = {}
for i, key in enumerate(self.outputs):
val = results[i]
dic[key] = val.float().cpu().detach().numpy() if val.dtype == torch.bfloat16 else val.cpu().detach().numpy()
results = dic
else:
results = {self.outputs[0]: results.float().cpu().numpy() if results.dtype==torch.bfloat16 else results.cpu().numpy()}
else:
print("Just test pytorch for now.")
return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
enable_profile = False
if enable_profile:
warmup_steps = 2
active_steps = 5
import habana_frameworks.torch.core as htcore
prof = torch.profiler.profile(
activities=(torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.HPU),
schedule=torch.profiler.schedule(wait=0, warmup=warmup_steps, active=active_steps, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler('./profile/'),
record_shapes=False,
with_stack=True)
for _ in range(30):
self.predict(test_data)
if enable_profile:
prof.start()
for _ in range(iterations):
start_time = time.time()
self.predict(test_data)
end_time = time.time()
times_range.append(end_time - start_time)
if enable_profile:
prof.step()
if enable_profile:
prof.stop()
times_range.sort()
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
# start_time = time.time()
# threads = []
# for i in range(iterations):
# with torch.hpu.stream(torch.hpu.Stream()):
# threads.append(Thread(target=self.predict, args=(test_data,)))
# threads[i].start()
# for t in threads:
# t.join()
# end_time = time.time()
# qps = int(1000.0 * batch_size * iterations / (end_time-start_time))
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
import habana_frameworks.torch.core as htcore
for i, segment in enumerate(self.configs['segments']):
# there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Pytorch":
self.device = torch.device('hpu')
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj']).to(self.device)
model.to(torch.bfloat16)
model.eval()
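# Wrapping the model as an HPU graph (below) records the device graph once and replays it
# on later calls, which avoids rebuilding the graph on every inference in lazy mode.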
from habana_frameworks.torch.hpu import wrap_in_hpu_graph
model = wrap_in_hpu_graph(model)
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
compiled_models/
pre_optimized_models/
__pycache__/
<svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 797.92 157.03"><defs><style>.cls-1{fill:#292c31;}</style></defs><path class="cls-1" d="M570.12,83.64V72.08a.31.31,0,0,0-.31-.31H546.17a.33.33,0,0,1-.33-.33h0V57a.33.33,0,0,0-.33-.33H534a.33.33,0,0,0-.33.33h0V98.8a.33.33,0,0,0,.33.33h11.48a.33.33,0,0,0,.33-.33h0V84.22a.32.32,0,0,1,.3-.33h23.67A.31.31,0,0,0,570.12,83.64Z"/><rect class="cls-1" x="545.84" y="45.52" width="30.34" height="11.13" rx="0.33"/><rect class="cls-1" x="545.84" y="99.13" width="30.34" height="12.14" rx="0.33"/><path class="cls-1" d="M527.67,111.25a.33.33,0,0,0,.3-.48l-13.62-27a.31.31,0,0,1,.13-.42l0,0c6.4-2.54,10.09-9.74,10.09-17,0-17.5-13.4-20.76-18.89-20.76H478.81a.34.34,0,0,0-.34.34h0v65.08a.34.34,0,0,0,.34.33H490.1a.33.33,0,0,0,.33-.33h0V87.68a.33.33,0,0,1,.33-.33h11.37a.31.31,0,0,1,.29.19l11.68,23.53a.33.33,0,0,0,.3.18Zm-26-35.85h-10.9a.33.33,0,0,1-.33-.34h0V57.8a.32.32,0,0,1,.33-.33h12.91c6.95,0,8.87,5.39,8.87,9.16C512.55,72.6,508.3,75.4,501.65,75.4Z"/><path class="cls-1" d="M437.92,44.28l-19.87,6.45a.43.43,0,0,0-.17.12L405.6,67.76a.31.31,0,0,0-.06.19V88.84a.36.36,0,0,0,.06.2l12.28,16.9a.3.3,0,0,0,.17.12l19.87,6.46a.34.34,0,0,0,.2,0L458,106.06a.28.28,0,0,0,.16-.12L470.43,89a.31.31,0,0,0,.07-.2V68a.26.26,0,0,0-.07-.19L458.15,50.85a.4.4,0,0,0-.16-.12l-19.87-6.45A.34.34,0,0,0,437.92,44.28Zm21,41L451,96.15a.31.31,0,0,1-.16.12l-12.76,4.15a.34.34,0,0,1-.2,0l-12.76-4.15a.3.3,0,0,1-.17-.12L417.11,85.3a.37.37,0,0,1-.07-.2V71.69a.32.32,0,0,1,.07-.19L425,60.64a.43.43,0,0,1,.17-.12l12.76-4.14a.34.34,0,0,1,.2,0l12.76,4.14a.48.48,0,0,1,.16.12l7.89,10.86a.31.31,0,0,1,.06.19V85.1A.36.36,0,0,1,458.93,85.3Z"/><path class="cls-1" d="M394.9,92a.35.35,0,0,0-.47,0c-3.08,3-7.28,8.26-15.68,8.26-11.75,0-20.85-7.65-20.85-21.73,0-14.6,8.82-22,20.85-22a22,22,0,0,1,15.68,7,.32.32,0,0,0,.45,0l0,0,7.7-8.36a.27.27,0,0,0,0-.36,34.12,34.12,0,0,0-24.54-10.57c-19.67,0-32.85,14.37-32.85,34.27,0,21.47,16.65,34.07,33.11,34.07,10.54,0,19.82-6.61,24.66-11.71a.29.29,0,0,0,0-.37Z"/><path class="cls-1" d="M324.61,71.83H295.94a.33.33,0,0,1-.33-.33h0V45.85a.33.33,0,0,0-.33-.33H283.8a.33.33,0,0,0-.33.33h0v65.09a.33.33,0,0,0,.33.33h11.48a.33.33,0,0,0,.33-.33h0V84.3a.33.33,0,0,1,.33-.33h28.67a.33.33,0,0,1,.33.33h0v26.64a.33.33,0,0,0,.33.33h11.48a.33.33,0,0,0,.33-.33h0V45.85a.33.33,0,0,0-.33-.33H325.27a.33.33,0,0,0-.33.33h0V71.49a.32.32,0,0,1-.31.34Z"/><path class="cls-1" d="M183.32,45.72,163.2,88.81a.31.31,0,0,0,0,.13v22a.33.33,0,0,0,.33.33H175a.33.33,0,0,0,.33-.33V93.39a.33.33,0,0,1,.33-.33h29.69a.33.33,0,0,1,.33.33v17.55a.33.33,0,0,0,.33.33h11.47a.33.33,0,0,0,.33-.33V89a.4.4,0,0,0,0-.15L197.2,45.71a.35.35,0,0,0-.3-.19H183.62A.33.33,0,0,0,183.32,45.72Zm-3.1,34.74L190,59.36a.33.33,0,0,1,.6,0l9.74,21.1a.33.33,0,0,1-.3.47H180.52A.33.33,0,0,1,180.22,80.46Z"/><path class="cls-1" d="M151.65,79.29s0,0,.06,0l4.14-12.75a.34.34,0,0,0,0-.2l-4.13-12.7a.3.3,0,0,0-.12-.17l-11-7.83a.31.31,0,0,0-.19-.06l-32.2,0a.34.34,0,0,0-.33.34v65.08a.33.33,0,0,0,.33.33h10.33a.33.33,0,0,0,.33-.33V87.69a.33.33,0,0,1,.33-.33l12.42-.14a.33.33,0,0,1,.3.19l11.73,23.68a.33.33,0,0,0,.3.18h11.92a.33.33,0,0,0,.3-.48L143.45,85.55a.34.34,0,0,1,.1-.42Zm-32.77-3.64V56.58a.33.33,0,0,1,.33-.33h17.73a.31.31,0,0,1,.19.06l5,3.61a.31.31,0,0,1,.12.16l1.9,5.85a.34.34,0,0,1,0,.2L142.21,72a.3.3,0,0,1-.12.17l-5.36,3.77a.35.35,0,0,1-.19.06H119.21A.33.33,0,0,1,118.88,75.65Z"/><path class="cls-1" 
d="M84.85,107.77v3.17a.33.33,0,0,0,.33.33H96.66a.33.33,0,0,0,.33-.33V75.19a.33.33,0,0,0-.33-.33H70a.33.33,0,0,0-.33.33V86.66A.33.33,0,0,0,70,87H84.52a.33.33,0,0,1,.33.33v6.49a.34.34,0,0,1-.16.29L75.5,99.72a.33.33,0,0,1-.17,0h-10a.33.33,0,0,1-.17,0l-8.89-5.44a.43.43,0,0,1-.11-.12L50.8,84.87a.29.29,0,0,1,0-.16V72.09a.33.33,0,0,1,0-.17l5.35-9.29a.43.43,0,0,1,.11-.12l8.89-5.43a.34.34,0,0,1,.17,0H75.85a.37.37,0,0,1,.18,0l8.09,5.31.08.06,3.59,4.37a.34.34,0,0,0,.46.05L97,60a.32.32,0,0,0,.06-.46l-4.2-5.38a.22.22,0,0,0-.09-.07l-13.5-8.52a.28.28,0,0,0-.18-.06h-17a.41.41,0,0,0-.18,0L47.65,54.3a.32.32,0,0,0-.12.11L39.3,69a.3.3,0,0,0,0,.16V87.65a.34.34,0,0,0,0,.17l8.21,14.51.06.12,14.34,8.77a.41.41,0,0,0,.18,0H78.57a.34.34,0,0,0,.17,0l5.86-3.59A.17.17,0,0,1,84.85,107.77Z"/><path class="cls-1" d="M239.79,57.81V75.09a.33.33,0,0,0,.33.33h23.22a.33.33,0,0,1,.33.33V87h0a.33.33,0,0,1-.33.33H240.12a.33.33,0,0,0-.33.33v23.24h0a.33.33,0,0,1-.33.33h-11.3a.33.33,0,0,1-.33-.33V45.85a.33.33,0,0,1,.33-.33h35.18a.33.33,0,0,1,.33.33v11.3a.33.33,0,0,1-.33.33H240.12A.33.33,0,0,0,239.79,57.81Z"/><rect class="cls-1" x="263.67" y="57.48" width="11.96" height="17.94" rx="0.33"/><path class="cls-1" d="M628,79.9a.37.37,0,0,0,.08-.54l-5.38-6.94a.36.36,0,0,0-.52-.08l-4.66,3.41V62.42h6.71a.36.36,0,0,0,.38-.38V53a.36.36,0,0,0-.38-.38h-6.71v-13a.36.36,0,0,0-.38-.38H607.6a.37.37,0,0,0-.38.38v13h-8.78a.35.35,0,0,0-.37.38v9a.34.34,0,0,0,.37.38h8.78V79.67h-8.76a.33.33,0,0,0-.37.37v9a.35.35,0,0,0,.37.37h8.76v14.44c0,1.79-.89,2.69-3,2.69a31.05,31.05,0,0,1-5.14-.45.37.37,0,0,0-.45.42l1.34,9.9a.33.33,0,0,0,.3.32,39.61,39.61,0,0,0,5.93.49c7.45,0,11.31-3.86,11.31-10.86V87.55Z"/><path class="cls-1" d="M648.88,77.53a.36.36,0,0,0,.49.27l8.9-3.16a.34.34,0,0,0,.24-.41A130.61,130.61,0,0,0,652,52.75a.34.34,0,0,0-.45-.2l-8.45,2.65a.35.35,0,0,0-.23.49C645.4,61.94,647.71,71.82,648.88,77.53Z"/><path class="cls-1" d="M758.44,102.82c-10.39-4.53-20.06-12.92-24.69-22.09h23.91a.36.36,0,0,0,.38-.38V70.87a.36.36,0,0,0-.38-.38H725.81V60h27.45a.35.35,0,0,0,.38-.37V50.13a.35.35,0,0,0-.38-.37H725.81V39.63a.35.35,0,0,0-.37-.38h-9.85a.35.35,0,0,0-.37.38V49.76h-27a.35.35,0,0,0-.37.37v9.49a.35.35,0,0,0,.37.37h27v10.5H683.81a.36.36,0,0,0-.37.38v9.48a.36.36,0,0,0,.37.38h24.7v6.55a.38.38,0,0,0,.38.38h6.33v28.87a.34.34,0,0,0,.37.38h9.85a.35.35,0,0,0,.37-.38V85.37c3.78,10.69,15.93,22.35,26.64,26.84a.35.35,0,0,0,.46-.15l5.7-8.7A.36.36,0,0,0,758.44,102.82Z"/><path class="cls-1" d="M668.62,94.48l4.23-12.81a.75.75,0,0,0,0-.24V40.54a.37.37,0,0,0-.38-.37h-9.66a.36.36,0,0,0-.38.37V80L657,95.65l-18.56,13.51a.38.38,0,0,0-.08.53l5.75,7.92a.37.37,0,0,0,.53.08L663,104l10.41,12.92a.38.38,0,0,0,.54.06l7.33-6.25a.36.36,0,0,0,0-.52Z"/><path class="cls-1" d="M640,105.78V96.1h9.41a.37.37,0,0,0,.37-.38v-9.3a.37.37,0,0,0-.37-.38H640V43.78a.36.36,0,0,0-.38-.38h-9.3a.36.36,0,0,0-.38.38V96.1h-6.45a.36.36,0,0,0-.37.37v9.31a.35.35,0,0,0,.37.37h16.13A.37.37,0,0,0,640,105.78Z"/><path class="cls-1" d="M708.14,87.66h-9.31a.35.35,0,0,0-.37.37v12.19a.36.36,0,0,0,.37.38h9.31a.37.37,0,0,0,.37-.38V88A.36.36,0,0,0,708.14,87.66Z"/><path class="cls-1" d="M698.08,100.59H687a.35.35,0,0,0-.37.37v9.31a.36.36,0,0,0,.37.37h11.1a.36.36,0,0,0,.37-.37V101A.35.35,0,0,0,698.08,100.59Z"/></svg>
\ No newline at end of file
[
{
"name": "batch_sizes",
"note": "The batch sizes run with benchmark",
"dialog_type": "Input Dialog",
"type": "str",
"default": "",
"depends": null
},
{
"name": "converter_options",
"note": "PopRT converter options will be used, please referes to the PopRT documentation",
"dialog_type": "Input Dialog",
"type": "str",
"default": "{}",
"depends": null
},
{
"name": "compiler_options",
"note": "PopRT compiler options will be used, please referes to the PopRT documentation",
"dialog_type": "Input Dialog",
"type": "str",
"default": "{}",
"depends": null
},
{
"name": "clients",
"note": "The number of clients used to run the benchmark",
"dialog_type": "Input Dialog",
"type": "str",
"default": "1",
"depends": null
},
{
"name": "pack",
"note": "Use pack solution to run the model or not",
"dialog_type": "Yes/No Dialog",
"type": "bool",
"default": "false",
"depends": null
},
{
"name": "runtime_options",
"note": "PopRT runtime options will be applied to RuntimeConfig, please referes to the PopRT documentation",
"dialog_type": "Input Dialog",
"type": "str",
"default": "{}",
"depends": null
},
{
"name": "precision",
"note": "请指定模型的精度",
"dialog_type": "Radiolist Dialog",
"options": [
"fp8",
"fp16"
],
"type": "str",
"default": "FP16",
"depends": null
}
]
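These entries look like the interactive-parameter profile that a get_interact_profile()-style hook (such as the HPU one above) hands back to the harness: each record names one prompt along with its dialog type, value type, and default. Below is a minimal sketch of collapsing the profile into a {name: default} dict; the file path is a guess modelled on the HPU pattern and is not taken from this commit.

```python
import json

# Path is an assumption following the HPU pattern (general_perf/backends/<HW>/<HW>.json).
with open("general_perf/backends/IPU/IPU.json") as f:
    profile = json.load(f)

defaults = {entry["name"]: entry["default"] for entry in profile}
# e.g. defaults["clients"] == "1" and defaults["pack"] == "false"
```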