"tests/models/electra/test_modeling_tf_electra.py" did not exist on "13deb95a405bbd1037ad233c692d7fd1de9d31e3"
Commit 24b257f1 authored by sunzhq2's avatar sunzhq2
Browse files

init

parent 920b3c0f
{
"model": "clip-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "conformer-encoder-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": true,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,32,64],
"data_percent": 100,
"compile_only": false
}
{
"model": "deberta-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-onnxruntime-fp16",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-onnxruntime-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-tf-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,32,64],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-torch-fp16",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "roberta-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "roformer-tf-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[2,4,8,16,32,64],
"data_percent": 100,
"compile_only": false
}
{
"model": "swin-large-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": true,
"clients": 3,
"iterations": 100,
"batch_sizes":[2,4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "unet-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "vae-decoder-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "vae-encoder-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "videobert-onnx-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "widedeep-tf-fp16",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1024,20000,40000,80000,120000,140000,160000,180000,200000,220000,240000,260000],
"data_percent": 100,
"compile_only": false
}
{
"model": "widedeep-tf-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1024,20000,40000,80000,120000,140000,160000,180000,200000,220000,240000,260000],
"data_percent": 100,
"compile_only": false
}
{
"model": "yolov5-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,32],
"data_percent": 100,
"compile_only": false
}
# Byte LLM Perf
## Requirements
* Python >= 3.8
* torch >= 2.1.0
## Installation
```shell
# modify according to torch version and hardware
pip3 install torch==2.1.0 --index-url https://download.pytorch.org/whl/cu118
# install required packages
pip3 install -r requirements.txt
```
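As an optional sanity check (not part of the official setup), you can confirm that the installed torch meets the requirement and that CUDA is visible:
```python
# Optional sanity check: confirm torch version and CUDA availability.
import torch

print(torch.__version__)          # expected to be >= 2.1.0
print(torch.cuda.is_available())  # expected True on a CUDA machine
print(torch.cuda.device_count())  # number of visible GPUs
```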
## Quick Start (run accuracy and performance tests)
Please complete the installation steps above before proceeding:
1. Modify task workload, for example, [chatglm2-torch-fp16-6b.json](https://github.com/bytedance/ByteMLPerf/blob/main/byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json)
2. Download model weights using prepare_model.sh or huggingface_cli.
3. Download the reference model output logits for specific input cases (.npy files) using prepare_model.sh.
4. Start accuracy and performance tests.
You can run the following command to automate all of these steps for the chatglm2 model on the GPU backend:
```shell
python3 byte_infer_perf/llm_perf/launch.py --hardware_type GPU --task chatglm2-torch-fp16-6b
```
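If you prefer to edit the workload programmatically, a minimal sketch is shown below. The path comes from step 1 above; the `batch_sizes` field is an assumption based on the workload configs shown earlier on this page, so check the actual JSON for the fields it really defines.
```python
# Illustrative sketch: load a workload file, tweak a field, and write it back.
# The "batch_sizes" key is an assumption; inspect the real file before editing.
import json

path = "byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json"
with open(path) as f:
    workload = json.load(f)

workload["batch_sizes"] = [1, 2, 4, 8]  # hypothetical edit

with open(path, "w") as f:
    json.dump(workload, f, indent=4)
```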
## Test accuracy (single query with a specified prompt)
Launch a server running mixtral-8x22b (tp_size=8, max_batch_size=8) with the following command:
```shell
cd byte_infer_perf/llm_perf
python3 ./server/launch_server.py --hardware_type GPU --model_config ./model_zoo/mixtral-torch-bf16-8x22b.json --tp_size 8 --max_batch_size 8
```
Test the server with a single prompt to get the inference result, a logits numpy file, and the model forward time. Output files will be located in `./reports/single_query/`:
```shell
python3 ./script/single_query.py --prompt "What is 7 multiplied by 7?" --batch_size 8
```
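To inspect the dumped logits afterwards, a small sketch is shown below; the exact file names under `./reports/single_query/` depend on your run, so the glob pattern is an assumption.
```python
# Illustrative sketch: list and inspect the .npy logits files from single_query.py.
import glob

import numpy as np

for path in sorted(glob.glob("./reports/single_query/*.npy")):
    logits = np.load(path)
    print(path, logits.shape, logits.dtype)
```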
## Test model_impl forward performance
This test only needs to instantiate an MpEngine running mixtral-8x22b (tp_size=8, max_batch_size=8) and feed it proper inputs. Running the following command will produce the performance outputs. Currently, you can modify the test cases in `./bench_model.py`.
```shell
python3 ./bench_model.py --hardware_type GPU --model_config ./model_zoo/mixtral-torch-bf16-8x22b.json --tp_size 8 --max_batch_size 8
```
The output will be located in `./reports/{hardware_type}/{model_config}/bench_model`; a short example of loading the latency CSVs follows the list below:
- **config.json**: perf config
- **context_perf.csv**: prefill latency for the specified {batch_size, seq_len} combinations
- **decode_perf.csv**: decode latency for the specified {batch_size, seq_len} combinations
- **output.txt**: raw latency data
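For example, a minimal sketch for loading the two latency CSVs; the concrete report directory below mirrors the mixtral command above and is an assumption, so adjust it to the directory your run actually produces.
```python
# Illustrative sketch: load the prefill/decode latency CSVs with pandas.
# The report directory name is an assumption based on the command above.
import pandas as pd

report_dir = "./reports/GPU/mixtral-torch-bf16-8x22b/bench_model"
context_perf = pd.read_csv(f"{report_dir}/context_perf.csv")
decode_perf = pd.read_csv(f"{report_dir}/decode_perf.csv")

print(context_perf.head())
print(decode_perf.head())
```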
## Demo Project
[GPU Backend](https://github.com/bytedance/ByteMLPerf/tree/main/byte_infer_perf/llm_perf/backends/GPU) provides a demo project that implements LLM inference of chatglm2-6b on an A100 with the following features:
- Separate functional components:
    * Scheduler
        - custom scheduling of tasks
    * Inferencer
        - transfers tasks into real inputs and collects outputs
    * Mp Engine
        - handles TP logic using multiple processes
    * Sampler
        - postprocessing logic
    * Ckpt Loader
        - custom ckpt loader with split logic that matches the TP logic
    * Custom model implementation
        - custom model implementation using the hardware backend's torch realization
- Separate scheduling logic (a minimal shape sketch follows below):
    * Context: one task, input_ids shape is [1, q_len]
    * Decode: multiple tasks, input_ids shape is up to [max_batch_size, 1]
- Tensor parallelism
- KV cache
The demo project is intended to provide a reference implementation, and there is no guarantee of optimal performance. More technical details will be provided later on [ByteMLPerf](https://bytemlperf.ai).
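For intuition, a minimal sketch of the two input shapes described in the scheduling list above; the concrete `q_len`, `max_batch_size`, and vocabulary size are arbitrary assumptions:
```python
# Illustrative sketch: input_ids shapes for the two scheduling phases.
import torch

max_batch_size, q_len, vocab_size = 8, 128, 65024  # arbitrary example values

# Context (prefill): a single task, the whole prompt in one forward pass.
context_input_ids = torch.randint(0, vocab_size, (1, q_len))

# Decode: up to max_batch_size tasks, one new token each per step.
decode_input_ids = torch.randint(0, vocab_size, (max_batch_size, 1))

print(context_input_ids.shape)  # torch.Size([1, 128])
print(decode_input_ids.shape)   # torch.Size([8, 1])
```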
## Vendor Integration
Vendors can refer to this document for guidance on building a backend: [Byte LLM Perf](https://bytemlperf.ai/zh/guide/inference_llm_vendor.html)
## Models
The following models are planned to be supported:
* [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
* [shenzhi-wang/Llama3-70B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-70B-Chinese-Chat)
* [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)
    - test_accuracy is temporarily unavailable.
* [mistralai/Mixtral-8x22B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
    - test_accuracy is temporarily unavailable.
import torch
import torch.distributed as dist

from llm_perf.core.ckpt_loader import CoreCkptLoader


class GpuCkptLoader(CoreCkptLoader):
    def __init__(
        self,
        prefix, model,
        mp_size=1, mp_rank=0,
        ckpt_path: str = ""
    ):
        super().__init__(prefix, model, mp_size, mp_rank, ckpt_path)

    def weight_to_device(self, weight: torch.Tensor, non_blocking=False):
        # Rank 0 owns the real weight and moves it to its GPU; other ranks
        # allocate an empty GPU tensor of the same shape/dtype to receive
        # the broadcast.
        if self.mp_rank == 0:
            weight = weight.cuda(non_blocking=non_blocking)
        else:
            cur_device = torch.cuda.current_device()
            weight = torch.empty_like(weight, device=f"cuda:{cur_device}")
        return weight

    def broadcast_weight(self, key, device='cpu', non_blocking=False):
        # Non-zero ranks only know the tensor metadata; materialize a
        # placeholder, then broadcast the real data from rank 0.
        if self.mp_rank != 0:
            tensor_shape = self.state_dict[key]["shape"]
            tensor_dtype = self.state_dict[key]["dtype"]
            tensor = torch.empty(tensor_shape, dtype=tensor_dtype)
        else:
            tensor = self.state_dict[key].cpu()
        tensor_gpu = self.weight_to_device(tensor, non_blocking=non_blocking)
        dist.broadcast(tensor_gpu, src=0)
        self.state_dict[key] = tensor_gpu

    def scatter_weight(self, key, dim, split_mode='default', outter=1, device='cpu', non_blocking=False):
        # Broadcast the full weight, then keep only this rank's shard along
        # `dim` according to the requested split mode.
        self.broadcast_weight(key, non_blocking=non_blocking)
        weight = self.state_dict[key]

        if split_mode == 'default':
            weight_split = self.split(weight, dim)
        elif split_mode == 'with_outter':
            weight_split = self.with_outter_split(weight, dim, outter)
        elif split_mode == 'split_outter':
            weight_split = self.split(weight, dim, outter)
        else:
            assert False, f"unknown split mode {split_mode}"

        weight_split = [x.contiguous() for x in weight_split]
        self.state_dict[key] = weight_split[self.mp_rank]
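For intuition about what `scatter_weight` keeps on each rank, below is a minimal, hypothetical sketch of an even split along one dimension; it is not the actual `CoreCkptLoader.split` implementation:
```python
# Illustrative sketch: slice a weight into mp_size shards along `dim`
# and keep the contiguous shard owned by the local rank.
import torch

def split_along_dim(weight: torch.Tensor, dim: int, mp_size: int, mp_rank: int) -> torch.Tensor:
    shards = torch.chunk(weight, mp_size, dim=dim)
    return shards[mp_rank].contiguous()

# Example: column-parallel split of a [4096, 11008] weight across 8 ranks.
full_weight = torch.randn(4096, 11008)
local_shard = split_along_dim(full_weight, dim=1, mp_size=8, mp_rank=0)
print(local_shard.shape)  # torch.Size([4096, 1376])
```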