Commit 24b257f1 authored by sunzhq2

init

parent 920b3c0f
__pycache__
*.pyc
*.prototxt
*.deploy
.vscode/
*.npy
*.tar
span.log
byte_micro_perf/backends/*/venv/
byte_micro_perf/reports/
byte_infer_perf/general_perf/tools/venv/
byte_infer_perf/general_perf/backends/*/venv/
byte_infer_perf/general_perf/model_zoo/*
!byte_infer_perf/general_perf/model_zoo/*.json
byte_infer_perf/general_perf/download/*.*
!byte_infer_perf/general_perf/download/README.md
byte_infer_perf/general_perf/datasets/open_imagenet/preprocessed/
byte_infer_perf/general_perf/datasets/*
!byte_infer_perf/general_perf/datasets/fake_dataset
!*.py
byte_infer_perf/general_perf/reports/*
!byte_infer_perf/general_perf/_inference/general_perf/reports/README
format_code.sh
init_env.sh
byte_infer_perf/llm_perf/download
byte_infer_perf/llm_perf/model_zoo/sota
byte_infer_perf/llm_perf/reports
workspace
test
<!-- omit in toc -->
# Contributing to Byte MLPerf
First of all, thanks for taking the time to contribute!
All types of contributions are encouraged and valued. See the [Table of Contents](#table-of-contents) for different ways to help and details about how this project handles them. Please make sure to read the relevant section before making your contribution. It will make it a lot easier for our maintainers and smooth out the experience for all involved. The community looks forward to your contributions.
> And if you like the project, but just don't have time to contribute, that's fine. There are other easy ways to support the project and show your appreciation, which we would also be very happy about:
> - Star the project
> - Tweet about it
> - Refer to this project in your project's readme
> - Mention the project at local meetups and tell your friends/colleagues
<!-- omit in toc -->
## Table of Contents
- [Contributor License Agreement](#contributor-license-agreement)
- [Pull Requests](#pull-requests)
- [I Have a Question](#i-have-a-question)
## Contributor License Agreement
Thank you for your interest in contributing to open source projects hosted or managed by Bytedance Ltd. and/or its Affiliates ("ByteDance"). In order to clarify the intellectual property license granted with Contributions from any person or entity, ByteDance must have a Contributor License Agreement ("CLA") on file that has been signed by each Contributor, indicating agreement to the license terms below. This license is for your protection as a Contributor as well as the protection of ByteDance and its users; it does not change your rights to use your own Contributions for any other purpose.
- If you work for a company that wants to allow you to contribute your work, then you'll need to sign a corporate CLA.
- If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an individual CLA.
- If you have not already done so, please complete and sign, then scan and email a pdf file of this Agreement to opensource-cla@bytedance.com. Please read this document carefully before signing and keep a copy for your records.
## Pull Requests
We actively welcome your pull requests.
- Fork the repo and create your branch from `master`.
- If you've changed APIs, update the documentation.
- Make sure your code lints.
- If you haven't already, complete the Contributor License Agreement ("CLA").
## I Have a Question
> If you want to ask a question, we assume that you have read the available [Documentation]().
Before you ask a question, it is best to search for existing [Issues](https://github.com/bytedance/ByteMLPerf/issues) that might help you. In case you have found a suitable issue and still need clarification, you can write your question in this issue. It is also advisable to search the internet for answers first.
If you then still feel the need to ask a question and need clarification, we recommend the following:
- Open an [Issue](https://github.com/bytedance/ByteMLPerf/issues/new).
- Provide as much context as you can about what you're running into.
- Provide project and platform versions, depending on what seems relevant.
We will then take care of the issue as soon as possible.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
ByteMLPerf
Copyright 2023 ByteDance Ltd. and/or its affiliates.
<div align="center">
<img src="docs/images/icon.png">
</div>
# ByteMLPerf Benchmark Tool
ByteMLPerf is an AI Accelerator Benchmark that focuses on evaluating AI Accelerators from a practical production perspective, including the ease of use and versatility of software and hardware. ByteMLPerf has the following characteristics:
- Models and runtime environments are more closely aligned with practical business use cases.
- For ASIC hardware evaluation, besides evaluating performance and accuracy, it also measures metrics like compiler usability and coverage.
- Performance and accuracy results obtained from testing on the open Model Zoo serve as reference metrics for evaluating ASIC hardware integration.
## Category
The ByteMLPerf benchmark is structured into three main categories: Inference, Training, and Micro, each targeting different aspects of AI accelerator performance:
- Inference: This category is subdivided into two distinct sections to cater to different types of models:
- General Performance: This section is dedicated to evaluating the inference capabilities of accelerators using common models such as ResNet-50 and BERT. It aims to provide a broad understanding of the accelerator's performance across a range of typical tasks. Vendors can refer to this document for guidance on building general perf backend: [ByteMLPerf General Perf Guide](https://bytedance.us.feishu.cn/docx/L98Mdw3J6obMtJxeRBzuHeRbsof) [[中文版](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf#TDK8of)]
- Large Language Model (LLM) Performance: Specifically designed to assess the capabilities of accelerators in handling large language models, this section addresses the unique challenges posed by the size and complexity of these models. Vendors can refer to this document for guidance on building llm perf backend: [ByteMLPerf LLM Perf Guide](https://bytedance.larkoffice.com/docx/ZoU7dkPXYoKtJtxlrRMcNGMwnTc) [[中文版](https://bytedance.larkoffice.com/docx/ZoU7dkPXYoKtJtxlrRMcNGMwnTc)]
- Micro: The Micro category focuses on the performance of specific operations or "ops" that are fundamental to AI computations, such as Gemm, Softmax, and various communication operations. This granular level of testing is crucial for understanding the capabilities and limitations of accelerators at a more detailed operational level. Vendors can refer to this document for guidance on building micro perf backend: [ByteMLPerf Micro Perf Guide](https://bytedance.us.larkoffice.com/docx/EpjFdSpRsoOIHWxtKgjuRsMPsFB) [[中文版](https://bytedance.us.larkoffice.com/docx/LJWvdGVAzoxXkTxF9h9uIETbsWc)]
- Training: Currently under development, this category aims to evaluate the performance of AI accelerators in training scenarios. It will provide insights into how well accelerators can handle the computationally intensive process of training AI models, which is vital for the development of new and more advanced AI systems.
Vendors looking to evaluate and improve their AI accelerators can utilize the ByteMLPerf benchmark as a comprehensive guide. The benchmark not only offers a detailed framework for performance and accuracy evaluation but also includes considerations for compiler usability and coverage for ASIC hardware, ensuring a holistic assessment approach.
For more details, you can visit our official website here: [bytemlperf.ai](https://bytemlperf.ai/)
## Vendor List
The ByteMLPerf vendor backend list is shown below:
| Vendor | SKU | Key Parameters | Inference(General Perf) | Inference(LLM Perf) |
| :---- | :----| :---- | :---- | :---- |
| Intel | Xeon | - | - | - |
| Stream Computing | STC P920 | <li>Computation Power:128 TFLOPS@FP16 <li> Last Level Buffer: 8MB, 256GB/s <li>Level 1 Buffer: 1.25MB, 512GB/s <li> Memory: 16GB, 119.4GB/S <li> Host Interface:PCIe 4, 16x, 32GB/s <li> TDP: 160W | [STC Introduction](byte_infer_perf/general_perf/backends/STC/README.md) | - |
| Graphcore | Graphcore® C600 | <li>Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8 <li> In Processor Memory: 900 MB, 52 TB/s <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 185W | [IPU Introduction](byte_infer_perf/general_perf/backends/IPU/README.md) | - |
| Moffett-AI | Moffett-AI S30 | <li>Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8, <li> Memory: 60 GB, <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 250W | [SPU Introduction](byte_infer_perf/general_perf/backends/SPU/README.md) | - |
| Habana | Gaudi2 | <li>24 Tensor Processor Cores, Dual matrix multiplication engines <li> Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](byte_infer_perf/general_perf/backends/HPU/README.md) | - |
## Statement
[ASF Statement on Compliance with US Export Regulations and Entity List](https://news.apache.org/foundation/entry/statement-by-the-apache-software)
<div align="center">
<img src="docs/images/icon.png">
</div>
# ByteMLPerf Benchmark Tool
ByteMLPerf is a benchmark suite used by ByteDance to measure how fast inference systems run models in a variety of deployment scenarios. Compared with MLPerf, ByteMLPerf has the following characteristics:
- Models and runtime environments are closer to real business scenarios;
- For new hardware, in addition to performance and accuracy, it also evaluates metrics such as graph-compilation usability and coverage;
- Performance and accuracy results measured on the open Model Zoo serve as a reference when evaluating the introduction of new hardware;
## Category
The ByteMLPerf benchmark is divided into three main categories, Inference, Training, and Micro, each targeting a different aspect of AI accelerator performance:
- Inference: This category is further subdivided into two parts to accommodate different types of models:
    - General Perf: This part evaluates the inference capabilities of accelerators using common models such as ResNet-50 and BERT, aiming to give a broad picture of accelerator performance across typical tasks. Vendors that want to integrate with General Perf can refer to this document: [ByteMLPerf Inference General Perf Vendor Integration Guide](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf)
    - LLM Perf: Designed specifically to assess accelerator capability on large language models, this part addresses the unique challenges posed by the size and complexity of these models. Vendors that want to integrate with LLM Perf can refer to this document: [ByteMLPerf Inference LLM Perf Vendor Integration Guide](https://bytedance.larkoffice.com/docx/ZoU7dkPXYoKtJtxlrRMcNGMwnTc)
- Micro: Micro Perf focuses on the performance of specific operations ("ops") that are fundamental to AI computation, such as Gemm, Softmax, and various communication operations. This level of testing is essential for understanding accelerator capabilities and limits at a finer operational granularity. Vendors that want to integrate with Micro Perf can refer to this document: [ByteMLPerf Micro Perf Vendor Integration Guide](https://bytedance.us.larkoffice.com/docx/LJWvdGVAzoxXkTxF9h9uIETbsWc)
- Training: Currently under development, this category aims to evaluate AI accelerator performance in training scenarios. It will offer insight into how well accelerators handle the compute-intensive process of training AI models, which is vital for developing new and more advanced AI systems.
Vendors looking to evaluate and improve their AI accelerators can use the ByteMLPerf benchmark as a comprehensive guide. The benchmark not only provides a detailed framework for performance and accuracy evaluation, but also covers compiler usability and coverage for ASIC hardware, ensuring a holistic assessment.
For more details, please visit our official website: [bytemlperf.ai](https://bytemlperf.ai/)
## Vendor List
The currently supported vendor backends are listed below:
| Vendor | SKU | Key Parameters | Inference(General Perf) | Inference(LLM Perf) |
| :---- | :----| :---- | :---- | :---- |
| Intel | Xeon | - | - | - |
| Stream Computing | STC P920 | <li>Computation Power:128 TFLOPS@FP16 <li> Last Level Buffer: 8MB, 256GB/s <li>Level 1 Buffer: 1.25MB, 512GB/s <li> Memory: 16GB, 119.4GB/S <li> Host Interface:PCIe 4, 16x, 32GB/s <li> TDP: 160W | [STC Introduction](byte_infer_perf/general_perf/backends/STC/README.md) | - |
| Graphcore | Graphcore® C600 | <li>Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8 <li> In Processor Memory: 900 MB, 52 TB/s <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 185W | [IPU Introduction](byte_infer_perf/general_perf/backends/IPU/README.md) | - |
| Moffett-AI | Moffett-AI S30 | <li>Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8, <li> Memory: 60 GB, <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 250W | [SPU Introduction](byte_infer_perf/general_perf/backends/SPU/README.md) | - |
| Habana | Gaudi2 | <li>24 Tensor Processor Cores, Dual matrix multiplication engines <li> Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](byte_infer_perf/general_perf/backends/HPU/README.md) | - |
## Statement
[ASF Statement on Compliance with US Export Regulations and Entity List](https://news.apache.org/foundation/entry/statement-by-the-apache-software)
major=1
minor=0
patch=0
<div align="center">
<img src="../../docs/images/icon.png">
</div>
# Byte MLPerf Inference Benchmark Tool
Byte MLPerf (Inference) is an AI Accelerator Benchmark that focuses on evaluating AI Accelerators from a practical production perspective, including the ease of use and versatility of software and hardware. Byte MLPerf has the following characteristics:
- Models and runtime environments are more closely aligned with practical business use cases.
- For ASIC hardware evaluation, besides evaluating performance and accuracy, it also measures metrics like compiler usability and coverage.
- Performance and accuracy results obtained from testing on the open Model Zoo serve as reference metrics for evaluating ASIC hardware integration.
Vendors can refer to this document for guidance on building backend: [ByteMLPerf Guide](https://bytedance.us.feishu.cn/docx/L98Mdw3J6obMtJxeRBzuHeRbsof) [[中文版](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf#TDK8of)]
## Usage
launch.py is the entry point. When using Byte MLPerf to evaluate a model, you only need to pass two parameters, --task and --hardware_type, as shown below:
```bash
python3 launch.py --task xxx --hardware_type xxx
```
1. task
The --task parameter is the name of the workload to run. You must specify a workload; for example, to evaluate the workload defined in bert-tf-fp16.json, specify --task bert-tf-fp16.
Note: All workloads are defined under general_perf/workloads, and the name passed here must match the file name. The current format is model-framework-precision.
2. hardware_type
The --hardware_type parameter is the name of the hardware backend. It has no default value and must be specified by the user. Example: to evaluate Habana Goya, specify --hardware_type GOYA.
Note: All hardware types are defined under general_perf/backends, and the name passed here must match the folder name.
3. compile_only
The --compile_only flag stops the task once compilation is finished.
4. show_task_list
The --show_task_list flag prints all task names.
5. show_hardware_list
The --show_hardware_list flag prints all supported hardware backends. Example invocations are shown below.
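For example (the task and backend names below are illustrative; use --show_task_list and --show_hardware_list to check what is actually available in your checkout):
```bash
# list available workloads and hardware backends
python3 launch.py --show_task_list
python3 launch.py --show_hardware_list

# run the bert-torch-fp32 workload on the reference CPU backend
python3 launch.py --task bert-torch-fp32 --hardware_type CPU

# compile only, without running the performance/accuracy tests
python3 launch.py --task bert-torch-fp32 --hardware_type CPU --compile_only
```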
### Workload Description
A workload definition needs to contain the following fields:
```javascript
{
"model": "bert-torch-fp32", //The name of the model to be evaluated, which needs to be aligned with the model_zoo name
"test_perf": true, //Evaluate model performance
"test_accuracy": true, //Evaluate model accuracy
"test_numeric": true, //Accuracy:Evaluate model numeric
"clients": 3, //Performance:Client threads that submit data
"iterations": 100, //Performance:How many iterations are submitted by each thread
"batch_sizes":[1,4,8,16,32,64],//Performance:The batch size when each thread submits data
"data_percent": 50, //Accuracy:Ratio of data to assess accuracy, [1-100]
"compile_only": false, //Compile the model only
}
```
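As a minimal sketch of how such a definition is consumed (assuming you run from the repository root, as launch.py does, so that general_perf is importable), the same helper used by the reference CPU engine in this commit can load it by task name:
```python
# illustrative only: load_workload resolves the task name to the JSON file
# under general_perf/workloads/, per the naming convention described above
from general_perf.core.configs.workload_store import load_workload

workload = load_workload("bert-torch-fp32")
print(workload["batch_sizes"], workload["data_percent"])
```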
## Model Zoo List
Model Zoo&Dataset
The models supported by Byte MLPerf are collected under the Model Zoo. In terms of access rights, they are currently divided into internal models and open models. Each Byte MLPerf release ships with the open models included in the corresponding version.
Open model collection principles:
- Basic Model: includes ResNet-50, BERT, and Wide&Deep;
- Popular Model: includes models currently widely used in industry;
- SOTA: includes the SOTA models of the corresponding business domains;
In addition to complete models, Byte MLPerf will also add some typical model substructures (subgraphs or ops) when no suitable open model containing such classic substructures can be found, such as transformer encoders/decoders with different sequence lengths, common conv ops such as group conv, depthwise conv, and pointwise conv, and common RNN structures such as GRU/LSTM.
| Model | Domain | Purpose | Framework | Dataset | Precision |
| ---- | ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | cv | regular | tensorflow, pytorch | imagenet2012 | fp32 |
| bert-base | nlp | regular | tensorflow, pytorch | squad-1.1 | fp32 |
| wide&deep | rec | regular | tensorflow | criteo | fp32 |
| videobert | mm |popular | onnx | cifar100 | fp32 |
| albert | nlp | popular | pytorch | squad-1.1 | fp32 |
| conformer | nlp | popular | onnx | none | fp32 |
| roformer | nlp | popular | tensorflow | cail2019 | fp32 |
| yolov5 | cv | popular | onnx | none | fp32 |
| roberta | nlp | popular | pytorch | squad-1.1 | fp32 |
| deberta | nlp | popular | pytorch | squad-1.1 | fp32 |
| swin-transformer | cv | popular | pytorch | imagenet2012 | fp32 |
| stable diffusion | cv | sota | onnx | none | fp32 |
### ByteIR
The ByteIR Project is a ByteDance model compilation solution. ByteIR includes compiler, runtime, and frontends, and provides an end-to-end model compilation solution.
Although all ByteIR components (compiler/runtime/frontends) work together to provide an end-to-end solution and live under the same umbrella repository, each component can technically be used independently.
For More Information, please refer to [ByteIR](https://github.com/bytedance/byteir)
Models Supported By ByteIR:
| Model | Domain | Purpose | Framework | Dataset | Precision |
| ---- | ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | cv | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/resnet50_mhlo.tar) | imagenet2012 | fp32 |
| bert-base | nlp | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/bert_mhlo.tar) | squad-1.1 | fp32 |
## Vendor List
The ByteMLPerf vendor backend list is shown below:
| Vendor | SKU | Key Parameters | Supplement |
| :---- | :----| :---- | :---- |
| Intel | Xeon | - | - |
| Stream Computing | STC P920 | <li>Computation Power:128 TFLOPS@FP16 <li> Last Level Buffer: 8MB, 256GB/s <li>Level 1 Buffer: 1.25MB, 512GB/s <li> Memory: 16GB, 119.4GB/S <li> Host Interface:PCIe 4, 16x, 32GB/s <li> TDP: 160W | [STC Introduction](general_perf/backends/STC/README.md) |
| Graphcore | Graphcore® C600 | <li>Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8 <li> In Processor Memory: 900 MB, 52 TB/s <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 185W | [IPU Introduction](general_perf/backends/IPU/README.md) |
| Moffett-AI | Moffett-AI S30 | <li>Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8, <li> Memory: 60 GB, <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 250W | [SPU Introduction](general_perf/backends/SPU/README.md) |
| Habana | Gaudi2 | <li>24 Tensor Processor Cores, Dual matrix multiplication engines <li> Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](general_perf/backends/HPU/README.md) |
## Statement
[ASF Statement on Compliance with US Export Regulations and Entity List](https://news.apache.org/foundation/entry/statement-by-the-apache-software)
<div align="center">
<img src="../../docs/images/icon.png">
</div>
# Byte MLPerf Inference Benchmark Tool
Byte MLPerf (Inference) is a benchmark suite used by ByteDance to measure how fast inference systems run models in a variety of deployment scenarios. Compared with MLPerf, Byte MLPerf has the following characteristics:
- Models and runtime environments are closer to real business scenarios;
- For new hardware, in addition to performance and accuracy, it also evaluates metrics such as graph-compilation usability and coverage;
- Performance and accuracy results measured on the open Model Zoo serve as a reference when evaluating the introduction of new hardware;
Vendors can refer to this document for onboarding: [ByteMLPerf Vendor Integration Guide](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf) [[English Version](https://bytedance.us.feishu.cn/docx/L98Mdw3J6obMtJxeRBzuHeRbsof)]
## Usage
launch.py is the entry point. When running a Byte MLPerf evaluation, you only need to pass two parameters, --task and --hardware_type, as shown below:
```bash
python3 launch.py --task xxx --hardware_type xxx
```
1. task
The --task parameter is the name of the workload to run. You must specify a workload; for example, to evaluate the workload defined in open_bert-tf-fp16.json, specify --task open_bert-tf-fp16.
Note: All workloads are defined under general_perf/workloads, and the name passed here must match the file name. The current format is model-framework-precision.
2. hardware_type
The --hardware_type parameter is the name of the hardware backend. It has no default value and must be specified by the user. Example: to evaluate Habana Goya, specify --hardware_type GOYA.
Note: All hardware types are defined under general_perf/backends, and the name passed here must match the folder name.
3. compile_only
The --compile_only flag stops the task once model compilation is finished.
4. show_task_list
The --show_task_list flag prints all task names.
5. show_hardware_list
The --show_hardware_list flag prints the names of all currently supported hardware backends. An example invocation is shown below.
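For example (the workload and backend names below are illustrative; use --show_task_list and --show_hardware_list to see what is available in your checkout):
```bash
python3 launch.py --task bert-torch-fp32 --hardware_type CPU
```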
### Workload Description
A workload definition needs to contain the following fields:
```javascript
{
"model": "bert-torch-fp32", //待评估模型的名字,需要和model_zoo名字对齐
"test_perf": true, //是否评估模型性能
"test_accuracy": true, //是否评估模型精度
"test_numeric": true, //精度:是否评估数值误差
"clients": 3, //性能:提交数据的client threads
"iterations": 100, //性能:每个thread提交多少iteration
"batch_sizes":[1,4,8,16,32], //性能:每个thread提交数据时的bs
"data_percent": 50, //精度:使用百分多少数据集评估精度, [1-100]
"compile_only": false, //是否仅编译模型
}
```
## Model Zoo List
Model Zoo&Dataset
The Model Zoo collects the models supported by Byte MLPerf. In terms of access rights, they are currently divided into internal models and open models. Each Byte MLPerf release ships with the open models included in the corresponding version.
Dataset refers to the datasets the models need; the corresponding dataloader and accuracy_checker are structurally grouped under Dataset as well.
Open model collection principles:
- Basic models: includes the very common ResNet-50, BERT, and Wide&Deep;
- Business-alike models: includes model structures that are widely used internally or structurally similar to them;
- SOTA models: includes the SOTA models of the corresponding business domains;
In addition to complete models, Byte MLPerf will also add some typical model substructures (subgraphs or ops) when no suitable open model containing such classic substructures can be found, such as transformer encoders/decoders with different sequence lengths, common conv ops such as group conv, depthwise conv, and pointwise conv, and common RNN structures such as GRU/LSTM.
| Model | Domain | Purpose | Framework | Dataset | Precision |
| ---- | ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | cv | regular | tensorflow, pytorch | imagenet2012 | fp32 |
| bert-base | nlp | regular | tensorflow, pytorch | squad-1.1 | fp32 |
| wide&deep | rec | regular | tensorflow | criteo | fp32 |
| videobert | mm |popular | onnx | cifar100 | fp32 |
| albert | nlp | popular | pytorch | squad-1.1 | fp32 |
| conformer | nlp | popular | onnx | none | fp32 |
| roformer | nlp | popular | tensorflow | cail2019 | fp32 |
| yolov5 | cv | popular | onnx | none | fp32 |
| roberta | nlp | popular | pytorch | squad-1.1 | fp32 |
| deberta | nlp | popular | pytorch | squad-1.1 | fp32 |
| swin-transformer | cv | popular | pytorch | imagenet2012 | fp32 |
| stable diffusion | cv | sota | onnx | none | fp32 |
### ByteIR
The ByteIR project is ByteDance's model compilation solution. ByteIR includes a compiler, a runtime, and frontends, and provides an end-to-end model compilation solution. Although all ByteIR components (compiler/runtime/frontends) together provide an end-to-end solution and live in the same repository, each component can technically be used independently.
For more information, please refer to [ByteIR](https://github.com/bytedance/byteir)
Models supported by ByteIR compilation:
| Model | Domain | Purpose | Framework | Dataset | Precision |
| ---- | ---- | ---- | ---- | ---- | ---- |
| resnet50-v1.5 | cv | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/resnet50_mhlo.tar) | imagenet2012 | fp32 |
| bert-base | nlp | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/bert_mhlo.tar) | squad-1.1 | fp32 |
## Vendor List
The currently supported vendor backends are listed below:
| Vendor | SKU | Key Parameters | Supplement |
| :---- | :----| :---- | :---- |
| Intel | Xeon | - | - |
| Stream Computing | STC P920 | <li>Computation Power:128 TFLOPS@FP16 <li> Last Level Buffer: 8MB, 256GB/s <li>Level 1 Buffer: 1.25MB, 512GB/s <li> Memory: 16GB, 119.4GB/S <li> Host Interface:PCIe 4, 16x, 32GB/s <li> TDP: 160W | [STC Introduction](byte_infer_perf/general_perf/backends/STC/README.md) |
| Graphcore | Graphcore® C600 | <li>Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8 <li> In Processor Memory: 900 MB, 52 TB/s <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 185W | [IPU Introduction](byte_infer_perf/general_perf/backends/IPU/README.zh_CN.md) |
| Moffett-AI | Moffett-AI S30 | <li>Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8, <li> Memory: 60 GB, <li> Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s <li> TDP: 250W | [SPU Introduction](byte_infer_perf/general_perf/backends/SPU/README.md) |
| Habana | Gaudi2 | <li>24 Tensor Processor Cores, Dual matrix multiplication engines <li> Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](byte_infer_perf/general_perf/backends/HPU/README.md) |
import sys
from packaging.version import parse
import warnings
from .version import __version__
def digit_version(version_str: str, length: int = 4):
"""Convert a version string into a tuple of integers.
This method is usually used for comparing two versions. For pre-release
versions: alpha < beta < rc.
Args:
version_str (str): The version string.
length (int): The maximum number of version levels. Defaults to 4.
Returns:
tuple[int]: The version info in digits (integers).
"""
assert 'parrots' not in version_str
version = parse(version_str)
assert version.release, f'failed to parse version {version_str}'
release = list(version.release)
release = release[:length]
if len(release) < length:
release = release + [0] * (length - len(release))
if version.is_prerelease:
mapping = {'a': -3, 'b': -2, 'rc': -1}
val = -4
# version.pre can be None
if version.pre:
if version.pre[0] not in mapping:
warnings.warn(f'unknown prerelease version {version.pre[0]}, '
'version checking may go wrong')
else:
val = mapping[version.pre[0]]
release.extend([val, version.pre[-1]])
else:
release.extend([val, 0])
elif version.is_postrelease:
release.extend([1, version.post]) # type: ignore
else:
release.extend([0, 0])
return tuple(release)
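# Illustrative examples (not part of the original module), following the scheme above:
#   digit_version('1.2.3')    -> (1, 2, 3, 0, 0, 0)
#   digit_version('1.0.0rc1') -> (1, 0, 0, 0, -1, 1)   # pre-releases sort before the final release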
python3_minimum_version = '3.6.0'
python_version = digit_version(sys.version.split()[0])
assert (python_version >= digit_version(python3_minimum_version)), \
f'PYTHON=={sys.version.split()[0]} is used but incompatible. ' \
f'Please install python>={python3_minimum_version}.'
__all__ = ['__version__']
[
{
"name": "omp",
"note": "是否开始OMP?",
"dialog_type": "Yes/No Dialog",
"type": "bool",
"default": false,
"depends": null
}
]
import argparse
import logging
import os
import importlib
import json
import sys
BYTE_MLPERF_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
os.chdir(BYTE_MLPERF_ROOT)
sys.path.insert(0, BYTE_MLPERF_ROOT)
from general_perf.core.configs.workload_store import load_workload
from general_perf.core.configs.dataset_store import load_dataset
from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("CPUBase")
def get_args():
"""Parse commandline."""
parser = argparse.ArgumentParser()
parser.add_argument("--task", default='resnet50-tf-fp32')
parser.add_argument("--hardware_type", default="CPU")
parser.add_argument("--batch_size",
type=int,
help="Batch sizes we will test in performace mode")
parser.add_argument(
"--data_percent",
type=int,
help=
"Data percent we will used in the whole data set when we will test in accuracy mode"
)
args = parser.parse_args()
return args
class PerfEngine(object):
def __init__(self) -> None:
super().__init__()
self.args = get_args()
self.workload = load_workload(self.args.task)
self.backend_type = self.args.hardware_type
def start_engine(self):
'''
        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
'''
log.info("Runing CPU Base...")
self.compile_backend = init_compile_backend(self.args.hardware_type)
self.runtime_backend = init_runtime_backend(self.args.hardware_type)
if self.workload:
return self.workload_perf(self.workload)
def workload_perf(self, workload):
# set reports dir
output_dir = os.path.abspath('general_perf/reports/' + self.args.hardware_type +
'/' + workload['model'])
os.makedirs(output_dir, exist_ok=True)
model_info = self.get_model_info(workload['model'])
ds = load_dataset(model_info)
ds.preprocess()
compile_info = self.compile_backend.compile({
"workload": workload,
'model_info': model_info
})
# load runtime backend
runtime_backend = self.runtime_backend
runtime_backend.configs = compile_info
runtime_backend.workload = workload
runtime_backend.model_info = model_info
runtime_backend.load(workload['batch_sizes'][0])
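        # the model is loaded once with the first configured batch size; the accuracy pass below reuses this instance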
# test accuracy
if workload['test_accuracy'] or workload['test_numeric']:
ds.rebatch(self.args.batch_size)
AccuracyChecker = self.get_accuracy_checker(
model_info['dataset_name']
if model_info['dataset_name'] else 'fake_dataset')
AccuracyChecker.runtime_backend = runtime_backend
AccuracyChecker.dataloader = ds
AccuracyChecker.output_dir = output_dir
AccuracyChecker.configs = compile_info
AccuracyChecker.calculate_acc(workload['data_percent'])
return
def get_accuracy_checker(self, dataset_name: str):
AccuracyChecker = importlib.import_module('general_perf.datasets.' +
dataset_name +
".test_accuracy")
AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
return AccuracyChecker()
def get_model_info(self, model_name: str):
with open("general_perf/model_zoo/" + model_name + '.json', 'r') as f:
model_info = json.load(f)
return model_info
if __name__ == "__main__":
engine = PerfEngine()
engine.start_engine()
#!/bin/bash
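# Usage: the task name is expected as $1 and the batch size as $2; both are forwarded to calculate_cpu_diff.py below.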
# if [ ! -d "general_perf/backends/CPU/venv" ];then
# virtualenv -p python3 general_perf/backends/CPU/venv
# source general_perf/backends/CPU/venv/bin/activate
# general_perf/backends/CPU/venv/bin/python3 -m pip install --upgrade pip -q
# general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q
# else
# source general_perf/backends/CPU/venv/bin/activate
# general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q
# fi
python3 general_perf/backends/CPU/calculate_cpu_diff.py --task $1 --batch_size $2
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import torch
import onnxruntime
import time
import numpy as np
from general_perf.backends import compile_backend
log = logging.getLogger("CompileBackendCPU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class CompileBackendCPU(compile_backend.CompileBackend):
def __init__(self):
super(CompileBackendCPU, self).__init__()
self.hardware_type = 'CPU'
self.need_reload = False
self.model_runtimes = []
def compile(self, config, dataloader=None):
result = {
"model":
config['model_info']['model'],
"framework":
config['model_info']['framework'],
"compile_precision":
config['model_info']['model_precision'],
"optimizations":{},
"instance_count": 1,
"device_count": 128,
"input_type":
config['model_info']['input_type'].split(","),
"max_batch_size":
config['model_info']['max_batch_size'],
"compile_status":
"success",
"sg_percent":
100,
"segments": [
{
"sg_idx":
0,
"is_fallback":
False,
"input_tensor_map":
config['model_info']['input_shape'],
"output_tensor_map":
config['model_info']['outputs'],
"compiled_model": [
{
"compiled_bs": 1,
"compiled_obj": config['model_info']['model_path'],
},
],
},
]
}
self.configs = result
self.workload = config['workload']
self.model_info = config['model_info']
return result
def get_interact_profile(self, config):
model_profile = []
file_path = "general_perf/backends/CPU/" + self.hardware_type + '.json'
if os.path.exists(file_path):
with open(file_path, 'r') as f:
model_profile = json.load(f)
else:
log.info(
'File path: {} does not exist, please check'.format(file_path))
return model_profile
def get_best_batch_size(self):
"""
Get Best Batch Size for the model
"""
return None
matplotlib
scikit-learn
opencv-python-headless
transformers
tokenization
bert-tensorflow==1.0.1
torchvision
onnx
numpy==1.19.2
tensorflow==2.4.0
onnxruntime
torch==1.13.1
sentencepiece==0.1.96
pandas==1.3.3
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import torch
import onnxruntime
import time
import numpy as np
from general_perf.backends import runtime_backend
log = logging.getLogger("BackendCPU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64,
"BOOL": np.bool
}
class RuntimeBackendCPU(runtime_backend.RuntimeBackend):
def __init__(self):
super(RuntimeBackendCPU, self).__init__()
self.hardware_type = 'CPU'
self.need_reload = False
self.model_runtimes = []
self.configs = None
self.batch_size = -1
def predict(self, feeds):
results = {}
if self.framework == "Tensorflow":
entry_rt = self.model_runtimes[0].signatures['serving_default']
all_sn_inputs = entry_rt.structured_input_signature
def get_real_feeds(feeds, sn_inputs):
sn_inputs = tf.nest.flatten(sn_inputs, True)
real_feeds = {}
itr = 0
for _, val in feeds.items():
real_feeds[sn_inputs[itr].name] = tf.constant(val)
itr += 1
return real_feeds
real_feeds = get_real_feeds(feeds, all_sn_inputs)
for model_runtime in self.model_runtimes:
with tf.device('/CPU:0'):
_results = model_runtime.signatures['serving_default'](
**real_feeds)
results = {}
for key, val in _results.items():
results[key] = val.numpy()
assert len(results) != 0
elif self.framework == "Pytorch":
input_tensors = []
i = 0
for key, _ in feeds.items():
input_tensors.append(
torch.tensor(feeds[key],
dtype=pt_dtype_map[self.input_type[i]]).to(
self.device))
i += 1
with torch.no_grad():
for model_runtime in self.model_runtimes:
results = model_runtime(*input_tensors)
if isinstance(results, dict):
for key, val in results.items():
results[key] = val.cpu().detach().numpy()
            elif isinstance(results, tuple):
                dic = {}
                for i, key in enumerate(self.outputs):
                    # map tuple outputs to their configured names and convert to numpy,
                    # mirroring the dict branch above
                    dic[key] = list(results)[i].cpu().detach().numpy()
                results = dic
else:
results = {self.outputs[0]: results.cpu().numpy()}
else:
for model_runtime in self.model_runtimes:
results = model_runtime.run(None, feeds)
return results
def benchmark(self, dataloader):
iterations = self.workload['iterations']
batch_size = self.get_loaded_batch_size()
times_range = []
report = {}
report['BS'] = batch_size
test_data = self._get_fake_samples(
batch_size, self.configs['segments'][0]['input_tensor_map'],
self.configs['input_type'])
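        # warm up with 30 untimed inference runs before measuring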
for _ in range(30):
self.predict(test_data)
for _ in range(iterations):
start_time = time.time()
self.predict(test_data)
end_time = time.time()
times_range.append(end_time - start_time)
times_range.sort()
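        # P99 tail latency and average latency are reported in milliseconds; QPS is derived from the average latency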
tail_latency = round(
times_range[int(len(times_range) * 0.99)] * 1000, 2)
avg_latency = round(sum(times_range) / iterations * 1000, 2)
qps = int(1000.0 * batch_size / avg_latency)
log.info(
'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'.
format(batch_size, qps, avg_latency, tail_latency))
report['QPS'] = qps
report['AVG Latency'] = avg_latency
report['P99 Latency'] = tail_latency
return report
def get_loaded_batch_size(self):
return self.batch_size
def load(self, batch_size) -> None:
self.batch_size = batch_size
self.model_runtimes = []
self.input_type = self.configs['input_type']
self.framework = self.configs['framework']
self.model_name = self.configs['model']
for i, segment in enumerate(self.configs['segments']):
            # there is no input/output metadata in the graph, so it needs to come from the config.
if not segment['input_tensor_map']:
raise ValueError("Segment " + str(i) + " needs inputs")
if not segment['output_tensor_map']:
raise ValueError("Segment " + str(i) + " needs outputs")
self.input_shapes = segment['input_tensor_map']
self.outputs = segment['output_tensor_map'].split(",")
if self.framework == "Tensorflow":
with tf.device('/CPU:0'):
model = tf.saved_model.load(
segment['compiled_model'][0]['compiled_obj'])
elif self.framework == "Pytorch":
self.device = "cpu"
model = torch.jit.load(
segment['compiled_model'][0]['compiled_obj'],
torch.device('cpu'))
model.eval()
else:
model = onnxruntime.InferenceSession(
segment['compiled_model'][0]['compiled_obj'],
providers=['CPUExecutionProvider'])
self.model_runtimes.append(model)
def _get_fake_samples(self, batch_size, shape, input_type):
data = {}
if input_type:
i = 0
for key, val in shape.items():
if key != "text":
val = [val[0] * batch_size] + val[1:]
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[input_type[i]])
else:
data[key] = np.random.random(size=val).astype(
INPUT_TYPE[input_type[i]])
i += 1
return data
else:
raise ValueError("Please provide input type")
from torch.utils.data import DataLoader as DataLoaderX
from dataset.dataset import ImageNetDataset,MZJBertDataset,DummyDataset
from nn_compiler.common.constants import OpType
from common_compile import SparsertBaseBuilder
import onnx
def get_onnx_input_info(onnx_model_path):
# Load ONNX model
model = onnx.load(onnx_model_path)
# Initialize an empty dictionary to store input names and shapes
input_info = {}
# Iterate through the inputs of the model
for input in model.graph.input:
input_name = input.name
input_shape = [dim.dim_value for dim in input.type.tensor_type.shape.dim]
input_info[input_name] = input_shape
return input_info
def get_model_input_info(onnx_input_info,batch_size):
config_input_dict = {}
input_shape_dict = {}
for input_name,input_shape in onnx_input_info.items():
config_input_dict[input_name] = input_name
input_shape[0] = batch_size
input_shape_dict[input_name] = input_shape
return config_input_dict,input_shape_dict
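# Illustrative example: for an ONNX model whose only input is "input_1" with shape [1, 3, 224, 224],
# get_model_input_info(get_onnx_input_info(onnx_path), batch_size=8) returns
# ({"input_1": "input_1"}, {"input_1": [8, 3, 224, 224]}); the tensor name here is hypothetical.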
class Resnet50Builder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(Resnet50Builder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# calibration dataset config
dataset = ImageNetDataset(self.dataset_dir, transform_file=self.dataset_cfg)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size)
self.config.calib_batch = 1
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# you can also set other configs here
self.config.do_kl = True
self.config.opt_level = 8
self.config.total_cores = 1
class BertBaseBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(BertBaseBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class AlbertBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(AlbertBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class RobertaBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(RobertaBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = MZJBertDataset(data_path=self.dataset_dir, input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class ConformerBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(ConformerBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = DummyDataset(input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
class GeneralBuilder(SparsertBaseBuilder):
def __init__(self, onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs):
super(GeneralBuilder, self).__init__(
onnx_path, dump_dir, dataset_dir, dataset_cfg, dtype, batch_size, verify, **kwargs)
def set_dataset_config(self):
# model inputs info
onnx_input_info = get_onnx_input_info(self.onnx_path)
self.config.input_dict,self.input_shape_dict = get_model_input_info(onnx_input_info,self.batch_size)
# calibration dataset config
dataset = DummyDataset(input_info=self.config.input_dict)
self.config.dataloader = DataLoaderX(dataset, batch_size=self.batch_size, shuffle=False, num_workers=4)
self.config.calib_batch = 1
# you can also set other configs here
self.config.do_kl = False
self.config.opt_level = 5
self.config.safe_exp = False
self.config.quantized_patterns = [[OpType.BatchMatmul]]
import os
import json
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import torch
import time
import numpy as np
import onnxruntime as ort
from general_perf.backends import compile_backend
log = logging.getLogger("CompileBackendDCU")
pt_dtype_map = {
"FLOAT32": torch.float32,
"FLOAT16": torch.float16,
"INT8": torch.int8,
"LONG": torch.long
}
INPUT_TYPE = {
"UINT8": np.uint8,
"FLOAT32": np.float32,
"LONG": np.long,
"INT32": np.int32,
"INT64": np.int64
}
class CompileBackendDCU(compile_backend.CompileBackend):
def __init__(self):
super(CompileBackendDCU, self).__init__()
self.hardware_type = 'DCU'
self.need_reload = False
self.model_runtimes = []
def compile(self, config, dataloader=None):
result = {
"model": config['model_info']['model'],
"framework": config['model_info']['framework'],
"compile_precision": config['model_info']['model_precision'],
"optimizations":{},
"instance_count": 1,
"device_count": 1,
"input_type":
config['model_info']['input_type'],
"max_batch_size":
config['model_info']['max_batch_size'],
"compile_status":
"success",
"sg_percent":
100,
"segments": [
{
"sg_idx":
0,
"is_fallback":
False,
"input_tensor_map":
config['model_info']['input_shape'],
"output_tensor_map":
config['model_info']['outputs'],
"compiled_model": [
{
"compiled_bs": 1,
"compiled_obj": config['model_info']['model_path'],
},
],
},
]
}
self.configs = result
self.workload = config['workload']
self.model_info = config['model_info']
return result
def get_interact_profile(self, config):
model_profile = []
file_path = "general_perf/backends/DCU/" + self.hardware_type + '.json'
if os.path.exists(file_path):
with open(file_path, 'r') as f:
model_profile = json.load(f)
else:
log.info(
'File path: {} does not exist, please check'.format(file_path))
return model_profile
def get_best_batch_size(self):
"""
Get Best Batch Size for the model
"""
return None