"tests/models/electra/test_modeling_tf_electra.py" did not exist on "13deb95a405bbd1037ad233c692d7fd1de9d31e3"
Commit 24b257f1 authored by sunzhq2's avatar sunzhq2
Browse files

init

parent 920b3c0f
{
"model": "clip-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "conformer-encoder-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": true,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,32,64],
"data_percent": 100,
"compile_only": false
}
{
"model": "deberta-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-onnxruntime-fp16",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-onnxruntime-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-tf-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,32,64],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-torch-fp16",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "resnet50-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,2,4,8,16,32,64,128,256,512,1024],
"data_percent": 100,
"compile_only": false
}
{
"model": "roberta-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1,4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "roformer-tf-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[2,4,8,16,32,64],
"data_percent": 100,
"compile_only": false
}
{
"model": "swin-large-torch-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": true,
"clients": 3,
"iterations": 100,
"batch_sizes":[2,4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "unet-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "vae-decoder-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "vae-encoder-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "videobert-onnx-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,24],
"data_percent": 100,
"compile_only": false
}
{
"model": "widedeep-tf-fp16",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1024,20000,40000,80000,120000,140000,160000,180000,200000,220000,240000,260000],
"data_percent": 100,
"compile_only": false
}
{
"model": "widedeep-tf-fp32",
"test_perf": true,
"test_accuracy": true,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[1024,20000,40000,80000,120000,140000,160000,180000,200000,220000,240000,260000],
"data_percent": 100,
"compile_only": false
}
{
"model": "yolov5-onnx-fp32",
"test_perf": true,
"test_accuracy": false,
"test_numeric": false,
"clients": 3,
"iterations": 100,
"batch_sizes":[4,8,16,32],
"data_percent": 100,
"compile_only": false
}
# Byte LLM Perf
## Requirements
* Python >= 3.8
* torch >= 2.1.0
## Installation
```shell
# modify according to torch version and hardware
pip3 install torch==2.1.0 --index-url https://download.pytorch.org/whl/cu118
# install required packages
pip3 install -r requirements.txt
```
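As an optional sanity check (not part of the official setup), you can confirm that the installed torch meets the requirement and that CUDA is visible:
```python
# Optional sanity check: confirm torch version and CUDA availability.
import torch

print(torch.__version__)          # expected to be >= 2.1.0
print(torch.cuda.is_available())  # expected True on a CUDA machine
print(torch.cuda.device_count())  # number of visible GPUs
```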
## Quick Start (run accuracy and performance tests)
Please complete the installation steps above before proceeding:
1. Modify task workload, for example, [chatglm2-torch-fp16-6b.json](https://github.com/bytedance/ByteMLPerf/blob/main/byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json)
2. Download model weights using prepare_model.sh or huggingface_cli.
3. Download the reference model output logits for specific input cases (.npy files) using prepare_model.sh.
4. Start accuracy and performance tests.
You can run the following command to automate all of these steps for the chatglm2 model on the GPU backend:
```shell
python3 byte_infer_perf/llm_perf/launch.py --hardware_type GPU --task chatglm2-torch-fp16-6b
```
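If you prefer to edit the workload programmatically, a minimal sketch is shown below. The path comes from step 1 above; the `batch_sizes` field is an assumption based on the workload configs shown earlier on this page, so check the actual JSON for the fields it really defines.
```python
# Illustrative sketch: load a workload file, tweak a field, and write it back.
# The "batch_sizes" key is an assumption; inspect the real file before editing.
import json

path = "byte_infer_perf/llm_perf/workloads/chatglm2-torch-fp16-6b.json"
with open(path) as f:
    workload = json.load(f)

workload["batch_sizes"] = [1, 2, 4, 8]  # hypothetical edit

with open(path, "w") as f:
    json.dump(workload, f, indent=4)
```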
## Test accuracy (single query with a specified prompt)
Launch a server running mixtral-8x22b (tp_size=8, max_batch_size=8) with the following command:
```shell
cd byte_infer_perf/llm_perf
python3 ./server/launch_server.py --hardware_type GPU --model_config ./model_zoo/mixtral-torch-bf16-8x22b.json --tp_size 8 --max_batch_size 8
```
Test the server with a single prompt to get the inference result, a logits numpy file, and the model forward time. Output files will be located in `./reports/single_query/`:
```shell
python3 ./script/single_query.py --prompt "What is 7 multiplied by 7?" --batch_size 8
```
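To inspect the dumped logits afterwards, a small sketch is shown below; the exact file names under `./reports/single_query/` depend on your run, so the glob pattern is an assumption.
```python
# Illustrative sketch: list and inspect the .npy logits files from single_query.py.
import glob

import numpy as np

for path in sorted(glob.glob("./reports/single_query/*.npy")):
    logits = np.load(path)
    print(path, logits.shape, logits.dtype)
```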
## Test model_impl forward performance
This test only needs to instantiate an MpEngine running mixtral-8x22b (tp_size=8, max_batch_size=8) and feed it proper inputs. Running the following command will produce the performance outputs. Currently, you can modify the test cases in `./bench_model.py`.
```shell
python3 ./bench_model.py --hardware_type GPU --model_config ./model_zoo/mixtral-torch-bf16-8x22b.json --tp_size 8 --max_batch_size 8
```
The output will be located in `./reports/{hardware_type}/{model_config}/bench_model`; a short example of loading the latency CSVs follows the list below:
- **config.json**: perf config
- **context_perf.csv**: prefill latency for the specified {batch_size, seq_len} combinations
- **decode_perf.csv**: decode latency for the specified {batch_size, seq_len} combinations
- **output.txt**: raw latency data
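For example, a minimal sketch for loading the two latency CSVs; the concrete report directory below mirrors the mixtral command above and is an assumption, so adjust it to the directory your run actually produces.
```python
# Illustrative sketch: load the prefill/decode latency CSVs with pandas.
# The report directory name is an assumption based on the command above.
import pandas as pd

report_dir = "./reports/GPU/mixtral-torch-bf16-8x22b/bench_model"
context_perf = pd.read_csv(f"{report_dir}/context_perf.csv")
decode_perf = pd.read_csv(f"{report_dir}/decode_perf.csv")

print(context_perf.head())
print(decode_perf.head())
```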
## Demo Project
[GPU Backend](https://github.com/bytedance/ByteMLPerf/tree/main/byte_infer_perf/llm_perf/backends/GPU) provides a demo project that implements LLM inference of chatglm2-6b on an A100 with the following features:
- Separate functional components:
    * Scheduler
        - custom scheduling of tasks
    * Inferencer
        - transfers tasks into real inputs and collects outputs
    * Mp Engine
        - handles TP logic using multiple processes
    * Sampler
        - postprocessing logic
    * Ckpt Loader
        - custom ckpt loader with split logic that matches the TP logic
    * Custom model implementation
        - custom model implementation using the hardware backend's torch realization
- Separate scheduling logic (a minimal shape sketch follows below):
    * Context: one task, input_ids shape is [1, q_len]
    * Decode: multiple tasks, input_ids shape is up to [max_batch_size, 1]
- Tensor parallelism
- KV cache
The demo project is intended to provide a reference implementation, and there is no guarantee of optimal performance. More technical details will be provided later on [ByteMLPerf](https://bytemlperf.ai).
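For intuition, a minimal sketch of the two input shapes described in the scheduling list above; the concrete `q_len`, `max_batch_size`, and vocabulary size are arbitrary assumptions:
```python
# Illustrative sketch: input_ids shapes for the two scheduling phases.
import torch

max_batch_size, q_len, vocab_size = 8, 128, 65024  # arbitrary example values

# Context (prefill): a single task, the whole prompt in one forward pass.
context_input_ids = torch.randint(0, vocab_size, (1, q_len))

# Decode: up to max_batch_size tasks, one new token each per step.
decode_input_ids = torch.randint(0, vocab_size, (max_batch_size, 1))

print(context_input_ids.shape)  # torch.Size([1, 128])
print(decode_input_ids.shape)   # torch.Size([8, 1])
```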
## Vendor Integration
Vendors can refer to this document for guidance on building a backend: [Byte LLM Perf](https://bytemlperf.ai/zh/guide/inference_llm_vendor.html)
## Models
The following models are planned to be supported:
* [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
* [shenzhi-wang/Llama3-70B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-70B-Chinese-Chat)
* [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)
    - test_accuracy is temporarily unavailable.
* [mistralai/Mixtral-8x22B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)
    - test_accuracy is temporarily unavailable.
import torch
import torch.distributed as dist

from llm_perf.core.ckpt_loader import CoreCkptLoader


class GpuCkptLoader(CoreCkptLoader):
    def __init__(
        self,
        prefix, model,
        mp_size=1, mp_rank=0,
        ckpt_path: str = ""
    ):
        super().__init__(prefix, model, mp_size, mp_rank, ckpt_path)

    def weight_to_device(self, weight: torch.Tensor, non_blocking=False):
        # Rank 0 owns the real weight and moves it to its GPU; other ranks
        # allocate an empty GPU tensor of the same shape/dtype to receive
        # the broadcast.
        if self.mp_rank == 0:
            weight = weight.cuda(non_blocking=non_blocking)
        else:
            cur_device = torch.cuda.current_device()
            weight = torch.empty_like(weight, device=f"cuda:{cur_device}")
        return weight

    def broadcast_weight(self, key, device='cpu', non_blocking=False):
        # Non-zero ranks only know the tensor metadata; materialize a
        # placeholder, then broadcast the real data from rank 0.
        if self.mp_rank != 0:
            tensor_shape = self.state_dict[key]["shape"]
            tensor_dtype = self.state_dict[key]["dtype"]
            tensor = torch.empty(tensor_shape, dtype=tensor_dtype)
        else:
            tensor = self.state_dict[key].cpu()
        tensor_gpu = self.weight_to_device(tensor, non_blocking=non_blocking)
        dist.broadcast(tensor_gpu, src=0)
        self.state_dict[key] = tensor_gpu

    def scatter_weight(self, key, dim, split_mode='default', outter=1, device='cpu', non_blocking=False):
        # Broadcast the full weight, then keep only this rank's shard along
        # `dim` according to the requested split mode.
        self.broadcast_weight(key, non_blocking=non_blocking)
        weight = self.state_dict[key]

        if split_mode == 'default':
            weight_split = self.split(weight, dim)
        elif split_mode == 'with_outter':
            weight_split = self.with_outter_split(weight, dim, outter)
        elif split_mode == 'split_outter':
            weight_split = self.split(weight, dim, outter)
        else:
            assert False, f"unknown split mode {split_mode}"

        weight_split = [x.contiguous() for x in weight_split]
        self.state_dict[key] = weight_split[self.mp_rank]
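For intuition about what `scatter_weight` keeps on each rank, below is a minimal, hypothetical sketch of an even split along one dimension; it is not the actual `CoreCkptLoader.split` implementation:
```python
# Illustrative sketch: slice a weight into mp_size shards along `dim`
# and keep the contiguous shard owned by the local rank.
import torch

def split_along_dim(weight: torch.Tensor, dim: int, mp_size: int, mp_rank: int) -> torch.Tensor:
    shards = torch.chunk(weight, mp_size, dim=dim)
    return shards[mp_rank].contiguous()

# Example: column-parallel split of a [4096, 11008] weight across 8 ranks.
full_weight = torch.randn(4096, 11008)
local_shard = split_along_dim(full_weight, dim=1, mp_size=8, mp_rank=0)
print(local_shard.shape)  # torch.Size([4096, 1376])
```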