Commit d7117b95 authored by zhouxiang

Sync the 0.2.6 code

parent 5f83e392
...@@ -9,6 +9,7 @@ __pycache__/

# Distribution / packaging
.Python
triton-rerope/
develop-eggs/
dist/
downloads/

...@@ -61,6 +62,7 @@ work_dir*/

!lmdeploy/turbomind/hf_repo/config.json

# Pytorch
*.pt
*.pth
*.py~
*.sh~

...@@ -72,6 +74,8 @@ work_dir*/

*.log
*.out
*.csv
!start_ids.csv
*.pkl
!CMakeLists.txt
proxy_config.yml
...@@ -3,7 +3,7 @@ repos:

    rev: 4.0.1
    hooks:
      - id: flake8
        args: ["--exclude=lmdeploy/turbomind/triton_models/*,lmdeploy/pytorch/modeling/*", "--max-line-length=79"]
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:

...@@ -12,6 +12,12 @@ repos:

    rev: v0.32.0
    hooks:
      - id: yapf
        name: yapf
        description: 'Formatter for Python code'
        entry: yapf
        language: python
        args: ['-i', '--style={based_on_style: pep8, column_limit: 79}']
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.2.0
    hooks:
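# Note: the hooks above can be exercised locally with `pre-commit install`
# (once) followed by `pre-commit run --all-files`, assuming the pre-commit
# package itself is installed.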
......
version: 2

formats: all

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.8"

python:
  install:
    - requirements: requirements/docs.txt
    - requirements: requirements/readthedocs.txt
...@@ -291,10 +291,33 @@ print(torch._C._GLIBCXX_USE_CXX11_ABI,end='');"

endif()
endif()
# turn off warnings on windows
if (MSVC)
  foreach(
    flag_var
    CMAKE_CXX_FLAGS
    CMAKE_CXX_FLAGS_DEBUG
    CMAKE_CXX_FLAGS_RELEASE
    CMAKE_CXX_FLAGS_MINSIZEREL
    CMAKE_CXX_FLAGS_RELWITHDEBINFO
    CMAKE_C_FLAGS
    CMAKE_C_FLAGS_DEBUG
    CMAKE_C_FLAGS_RELEASE
    CMAKE_C_FLAGS_MINSIZEREL
    CMAKE_C_FLAGS_RELWITHDEBINFO
    CMAKE_CUDA_FLAGS
    CMAKE_CUDA_FLAGS_DEBUG
    CMAKE_CUDA_FLAGS_RELEASE
    CMAKE_CUDA_FLAGS_MINSIZEREL
    CMAKE_CUDA_FLAGS_RELWITHDEBINFO)
    string(REGEX REPLACE "-Wall" " /W0 " ${flag_var} "${${flag_var}}")
  endforeach()
endif()
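# (The foreach/REGEX REPLACE above swaps any "-Wall" in the C, C++ and CUDA
# flag variables for "/W0", so MSVC builds compile without warnings.)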
if (BUILD_MULTI_GPU)
  list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
  #list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
  # list(APPEND COMMON_LIB_DIRS /opt/mpi/lib)
endif()

if(USE_TRITONSERVER_DATATYPE)

...@@ -376,7 +399,8 @@ add_library(transformer-shared SHARED

if (BUILD_MULTI_GPU)
  target_link_libraries(transformer-shared PUBLIC
    #-lmpi
    ${MPI_CXX_LIBRARIES}
    ${NCCL_LIBRARIES}
  )
endif()
......
<div align="center"> <div align="center">
<img src="resources/lmdeploy-logo.svg" width="450"/> <img src="docs/en/_static/image/lmdeploy-logo.svg" width="450"/>
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/en/latest/)
[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
[![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy) [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy)
[![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE) [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) |
[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) |
[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose)
English | [简体中文](README_zh-CN.md) English | [简体中文](README_zh-CN.md)
</div> 👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d)
<p align="center"> </div>
👋 join us on <a href="https://twitter.com/intern_lm" target="_blank">Twitter</a>, <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________ ______________________________________________________________________
## Latest News 🎉

<details open>
<summary><b>2024</b></summary>

- \[2024/03\] Support VLM offline inference pipeline and serving.
- \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on.
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/api_server.md).
- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md)
- \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies.

</details>

<details close>
<summary><b>2023</b></summary>

- \[2023/12\] Turbomind supports multimodal input. [Gradio Demo](./examples/vl/README.md)
- \[2023/11\] Turbomind supports loading hf model directly. Click [here](docs/en/inference/load_hf.md) for details.
- \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75
- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
...@@ -29,75 +47,71 @@ ______________________________________________________________________
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
- \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation. Check [this](docs/en/quantization/w4a16.md) guide for detailed info
- \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
- \[2023/08\] LMDeploy supports 4-bit quantization using the [AWQ](https://arxiv.org/abs/2306.00978) algorithm.
- \[2023/07\] TurboMind supports Llama-2 70B with GQA.
- \[2023/07\] TurboMind supports Llama-2 7B/13B.
- \[2023/07\] TurboMind supports tensor-parallel inference of InternLM.

</details>

______________________________________________________________________
# Introduction

LMDeploy is a toolkit for compressing, deploying, and serving LLMs, developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams. It has the following core features:

- **Efficient Inference**: LMDeploy delivers up to 1.8x higher request throughput than vLLM, by introducing key features like persistent batch (a.k.a. continuous batching), blocked KV cache, dynamic split & fuse, tensor parallelism, and high-performance CUDA kernels.

- **Effective Quantization**: LMDeploy supports weight-only and k/v quantization, and the 4-bit inference performance is 2.4x higher than FP16. The quantization quality has been confirmed via OpenCompass evaluation.

- **Effortless Distribution Server**: Leveraging the request distribution service, LMDeploy makes it easy to deploy multi-model services across multiple machines and GPUs.

- **Interactive Inference Mode**: By caching the k/v of attention during multi-round dialogue processes, the engine remembers dialogue history, thus avoiding repetitive processing of historical sessions.
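The features above are surfaced through the `pipeline` API. Below is a minimal sketch of how they are typically configured, assuming the `TurbomindEngineConfig` and `GenerationConfig` objects exported by the 0.2.x releases; field names such as `cache_max_entry_count` (the share of GPU memory reserved for the blocked KV cache) follow that assumption and may differ between versions.

```python
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

# Engine-level knobs: tensor parallelism and the share of GPU memory
# reserved for the blocked KV cache.
engine_cfg = TurbomindEngineConfig(tp=1, cache_max_entry_count=0.8)

# Per-request sampling parameters.
gen_cfg = GenerationConfig(max_new_tokens=256, top_p=0.8, temperature=0.7)

pipe = pipeline('internlm/internlm-chat-7b', backend_config=engine_cfg)
responses = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_cfg)
print(responses)
```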
# Performance

![v0 1 0-benchmark](https://github.com/InternLM/lmdeploy/assets/4560679/8e455cf1-a792-4fa8-91a2-75df96a2a5ba)

For detailed inference benchmarks on more devices and with more settings, please refer to the following links:

- [A100](./docs/en/benchmark/a100_fp16.md)
- V100
- 4090
- 3090
- 2080

# Supported Models

| Model | Size |
| :----------------: | :--------: |
| Llama | 7B - 65B |
| Llama2 | 7B - 70B |
| InternLM | 7B - 20B |
| InternLM2 | 7B - 20B |
| InternLM-XComposer | 7B |
| QWen | 7B - 72B |
| QWen1.5 | 0.5B - 72B |
| QWen-VL | 7B |
| Baichuan | 7B - 13B |
| Baichuan2 | 7B - 13B |
| Code Llama | 7B - 34B |
| ChatGLM2 | 6B |
| Falcon | 7B - 180B |
| YI | 6B - 34B |
| Mistral | 7B |
| DeepSeek-MoE | 16B |
| Mixtral | 8x7B |
| Gemma | 2B - 7B |
LMDeploy has developed two inference engines: [TurboMind](./docs/en/inference/turbomind.md) and [PyTorch](./docs/en/inference/pytorch.md), each with a different focus. The former strives for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to lower the barrier for developers.

They differ in the types of supported models and the inference data types. Please refer to [this table](./docs/en/supported_models/supported_models.md) for each engine's capability and choose the one that best fits your actual needs.
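As an illustration of switching between the two engines, the sketch below assumes that `lmdeploy.pipeline` accepts either engine's config object through `backend_config`, as in the 0.2.x API; the `use_turbomind` switch is purely hypothetical.

```python
from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline

use_turbomind = True  # hypothetical switch, just for this sketch

# TurboMind targets peak inference performance; the PyTorch engine trades
# some speed for a pure-Python code base that is easier to extend.
backend_config = (TurbomindEngineConfig(tp=1)
                  if use_turbomind else PytorchEngineConfig(tp=1))

pipe = pipeline('internlm/internlm-chat-7b', backend_config=backend_config)
print(pipe(['Hello, please introduce yourself.']))
```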
# Quick Start

## Installation

Install lmdeploy with pip (python 3.8+) or [from source](./docs/en/build.md)

...@@ -105,121 +119,54 @@ Install lmdeploy with pip ( python 3.8+) or [from source](./docs/en/build.md)

pip install lmdeploy
```

The default prebuilt package is compiled on CUDA 11.8. However, if CUDA 12+ is required, you can install lmdeploy by:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```
## Offline Batch Inference

```python
import lmdeploy
pipe = lmdeploy.pipeline("internlm/internlm-chat-7b")
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
```

> \[!NOTE\]
> By default, LMDeploy downloads models from HuggingFace. If you would like to use models from ModelScope, please install ModelScope by `pip install modelscope` and set the environment variable:
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
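Beyond the offline pipeline, the same model can be served and queried over an OpenAI-compatible REST API. The snippet below is a sketch only: it assumes an `lmdeploy serve api_server` instance is already listening on `http://localhost:23333` (the default address used in the serving docs) and that the served model is registered under the name `internlm-chat-7b`.

```python
# Query a running `lmdeploy serve api_server` via its OpenAI-compatible route.
import requests

payload = {
    'model': 'internlm-chat-7b',  # assumed to match the served model name
    'messages': [{'role': 'user', 'content': 'Hi, pls intro yourself'}],
    'temperature': 0.8,
}
resp = requests.post('http://localhost:23333/v1/chat/completions',
                     json=payload,
                     timeout=60)
print(resp.json()['choices'][0]['message']['content'])
```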
For more information about the inference pipeline, please refer to [here](./docs/en/inference/pipeline.md).

# Tutorials

Please read the [getting_started](./docs/en/get_started.md) section for the basic usage of LMDeploy.

For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/):

- User Guide
  - [LLM Inference pipeline](./docs/en/inference/pipeline.md)
  - [VLM Inference pipeline](./docs/en/inference/vl_pipeline.md)
  - [LLM Serving](docs/en/serving/api_server.md)
  - [VLM Serving](docs/en/serving/api_server_vl.md)
  - [Quantization](docs/en/quantization)
- Advanced Guide
  - [Inference Engine - TurboMind](docs/en/inference/turbomind.md)
  - [Inference Engine - PyTorch](docs/en/inference/pytorch.md)
  - [Customize chat templates](docs/en/advance/chat_template.md)
  - [Add a new model](docs/en/advance/pytorch_new_model.md)
  - gemm tuning
  - [Long context inference](docs/en/advance/long_context.md)
  - [Multi-model inference service](docs/en/serving/proxy_server.md)

# Third-party projects

- Deploying LLMs offline on the NVIDIA Jetson platform by LMDeploy: [LMDeploy-Jetson](https://github.com/BestAnHongjun/LMDeploy-Jetson)
## Contributing

...@@ -229,6 +176,8 @@ We appreciate all contributions to LMDeploy. Please refer to [CONTRIBUTING.md](.

- [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)
- [llm-awq](https://github.com/mit-han-lab/llm-awq)
- [vLLM](https://github.com/vllm-project/vllm)
- [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)

## License
......
<div align="center"> <div align="center">
<img src="resources/lmdeploy-logo.svg" width="450"/> <img src="docs/en/_static/image/lmdeploy-logo.svg" width="450"/>
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy-zh-cn.readthedocs.io/zh_CN/latest/)
[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
[![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy) [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy)
[![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE) [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[📘Documentation](https://lmdeploy.readthedocs.io/zh-cn/latest/) |
[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html) |
[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose)
[English](README.md) | 简体中文 [English](README.md) | 简体中文
</div> 👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d)
<p align="center"> </div>
👋 join us on <a href="https://twitter.com/intern_lm" target="_blank">Twitter</a>, <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________ ______________________________________________________________________
## 最新进展 🎉

<details open>
<summary><b>2024</b></summary>

- \[2024/03\] 支持视觉-语言模型(VLM)的离线推理 pipeline 和推理服务
- \[2024/02\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](./docs/zh_cn/serving/api_server.md)
- \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](./docs/zh_cn/serving/proxy_server.md)
- \[2024/01\] 增加 [PyTorch 推理引擎](./docs/zh_cn/inference/pytorch.md),作为 TurboMind 引擎的补充。帮助降低开发门槛,和快速实验新特性、新技术

</details>

<details close>
<summary><b>2023</b></summary>

- \[2023/12\] Turbomind 支持多模态输入。[Gradio Demo](./examples/vl/README.md)
- \[2023/11\] Turbomind 支持直接读取 Huggingface 模型。点击[这里](docs/zh_cn/inference/load_hf.md)查看使用方法
- \[2023/11\] TurboMind 重磅升级。包括:Paged Attention、更快的且不受序列最大长度限制的 attention kernel、2+倍快的 KV8 kernels、Split-K decoding (Flash Decoding) 和支持 sm_75 架构的 W4A16
- \[2023/09\] TurboMind 支持 Qwen-14B
- \[2023/09\] TurboMind 支持 InternLM-20B 模型
...@@ -29,76 +47,72 @@ ______________________________________________________________________
- \[2023/08\] TurboMind 支持 flash-attention2
- \[2023/08\] TurboMind 支持 Qwen-7B,动态NTK-RoPE缩放,动态logN缩放
- \[2023/08\] TurboMind 支持 Windows (tp=1)
- \[2023/08\] TurboMind 支持 4-bit 推理,速度是 FP16 的 2.4 倍,是目前最快的开源实现。部署方式请看[这里](docs/zh_cn/quantization/w4a16.md)
- \[2023/08\] LMDeploy 开通了 [HuggingFace Hub](https://huggingface.co/lmdeploy) ,提供开箱即用的 4-bit 模型
- \[2023/08\] LMDeploy 支持使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法进行 4-bit 量化
- \[2023/07\] TurboMind 支持使用 GQA 的 Llama-2 70B 模型
- \[2023/07\] TurboMind 支持 Llama-2 7B/13B 模型
- \[2023/07\] TurboMind 支持 InternLM 的 Tensor Parallel 推理

</details>

______________________________________________________________________
# 简介

LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](https://github.com/open-mmlab/mmrazor) 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。
这个强大的工具箱提供以下核心功能:

- **高效推理**:LMDeploy 开发了 Persistent Batch(即 Continuous Batch)、Blocked K/V Cache、动态拆分和融合、张量并行、高效的计算 kernel 等重要特性。推理性能是 vLLM 的 1.8 倍

- **可靠的量化**:LMDeploy 支持权重量化和 k/v 量化。4bit 模型推理效率是 FP16 下的 2.4 倍。量化模型的可靠性已通过 OpenCompass 评测得到充分验证

- **便捷的服务**:通过请求分发服务,LMDeploy 支持多模型在多机、多卡上的推理服务

- **有状态推理**:通过缓存多轮对话过程中 attention 的 k/v,记住对话历史,从而避免重复处理历史会话,显著提升长文本多轮对话场景中的效率

# 性能

LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型上,每秒处理的请求数是 vLLM 的 1.36 ~ 1.85 倍。在静态推理能力方面,TurboMind 4bit 模型推理速度(out token/s)远高于 FP16/BF16 推理,在小 batch 时可提高到 2.4 倍。
![v0 1 0-benchmark](https://github.com/InternLM/lmdeploy/assets/4560679/8e455cf1-a792-4fa8-91a2-75df96a2a5ba)

更多设备、更多计算精度、更多 setting 下的推理 benchmark,请参考以下链接:

- [A100](./docs/en/benchmark/a100_fp16.md)
- 4090
- 3090
- 2080

# 支持的模型

| Model | Size |
| :----------------: | :--------: |
| Llama | 7B - 65B |
| Llama2 | 7B - 70B |
| InternLM | 7B - 20B |
| InternLM2 | 7B - 20B |
| InternLM-XComposer | 7B |
| QWen | 7B - 72B |
| QWen-VL | 7B |
| QWen1.5 | 0.5B - 72B |
| Baichuan | 7B - 13B |
| Baichuan2 | 7B - 13B |
| Code Llama | 7B - 34B |
| ChatGLM2 | 6B |
| Falcon | 7B - 180B |
| YI | 6B - 34B |
| Mistral | 7B |
| DeepSeek-MoE | 16B |
| Mixtral | 8x7B |
| Gemma | 2B - 7B |
LMDeploy 支持 2 种推理引擎:[TurboMind](./docs/zh_cn/inference/turbomind.md) 和 [PyTorch](./docs/zh_cn/inference/pytorch.md),它们侧重不同。前者追求推理性能的极致优化,后者纯用 Python 开发,着重降低开发者的门槛。

它们在支持的模型类别、计算精度方面有所差别。用户可参考[这里](./docs/zh_cn/supported_models/supported_models.md),查阅每个推理引擎的能力,并根据实际需求选择合适的。
# 快速开始

## 安装

使用 pip ( python 3.8+) 安装 LMDeploy,或者[源码安装](./docs/zh_cn/build.md)

...@@ -106,117 +120,54 @@ TurboMind 的 output token throughput 超过 2000 token/s, 整体比 DeepSpeed

pip install lmdeploy
```
LMDeploy的预编译包默认是基于 CUDA 11.8 编译的。如果需要在 CUDA 12+ 下安装 LMDeploy,请执行以下命令:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```
## 离线批处理

```python
import lmdeploy
pipe = lmdeploy.pipeline("internlm/internlm-chat-7b")
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
```

> \[!NOTE\]
> LMDeploy 默认从 HuggingFace 上面下载模型,如果要从 ModelScope 上面下载模型,请通过命令 `pip install modelscope` 安装 ModelScope,并设置环境变量:
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
关于 pipeline 的更多推理参数说明,请参考[这里](./docs/zh_cn/inference/pipeline.md)

# 用户教程

请阅读[快速上手](./docs/zh_cn/get_started.md)章节,了解 LMDeploy 的基本用法。

为了帮助用户更进一步了解 LMDeploy,我们准备了用户指南和进阶指南,请阅读我们的[文档](https://lmdeploy.readthedocs.io/zh-cn/latest/):

- 用户指南
  - [LLM 推理 pipeline](./docs/zh_cn/inference/pipeline.md)
  - [VLM 推理 pipeline](./docs/zh_cn/inference/vl_pipeline.md)
  - [LLM 推理服务](./docs/zh_cn/serving/api_server.md)
  - [VLM 推理服务](./docs/zh_cn/serving/api_server_vl.md)
  - [模型量化](./docs/zh_cn/quantization)
- 进阶指南
  - [推理引擎 - TurboMind](./docs/zh_cn/inference/turbomind.md)
  - [推理引擎 - PyTorch](./docs/zh_cn/inference/pytorch.md)
  - [自定义对话模板](./docs/zh_cn/advance/chat_template.md)
  - [支持新模型](./docs/zh_cn/advance/pytorch_new_model.md)
  - gemm tuning
  - [长文本推理](./docs/zh_cn/advance/long_context.md)
  - [多模型推理服务](./docs/zh_cn/serving/proxy_server.md)

# 社区项目

- 使用 LMDeploy 在英伟达 Jetson 系列板卡部署大模型:[LMDeploy-Jetson](https://github.com/BestAnHongjun/LMDeploy-Jetson)
## 贡献指南

...@@ -226,6 +177,8 @@ LMDeploy 使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法对模型权重进

- [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)
- [llm-awq](https://github.com/mit-han-lab/llm-awq)
- [vLLM](https://github.com/vllm-project/vllm)
- [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)

## License
......
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-13b model"
exit 1
fi
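# Usage (illustrative): bash <this-script> /path/to/llama2-13b-hf-model
# The script converts the model for TurboMind (tp=1), tunes gemm, then runs
# the request-throughput and static-generation benchmarks defined below.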
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 500
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 16 32 64 \
--csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of internlm-20b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert internlm-20b ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 700
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 16 32 64 \
--csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
cp gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-70b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128 256)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 64 128 256 \
--csv ${output_path}/generation.csv
}
output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-7b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 1000
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 16 32 64 \
--csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import csv
import os
import time
from dataclasses import dataclass
from queue import Queue
from threading import Thread
from typing import List, Union

import numpy as np
from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
from tqdm import tqdm

from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
                               TurbomindEngineConfig)
def infer(model, session_id: int, input_ids: List,
          gen_config: EngineGenerationConfig, test_round: int, que: Queue):
    if session_id == 1:
        pbar = tqdm(total=test_round)
    chatbot = model.create_instance()
    output_seqlen = gen_config.max_new_tokens
    stats = []
    for _ in range(test_round):
        token_latency_stats = [0] * (output_seqlen + 1)
...@@ -44,20 +45,19 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,

        """  # noqa: E501
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            gen_config=gen_config,
                                            sequence_start=True,
                                            sequence_end=True,
                                            stream_output=True):
            _, res, n_token = outputs
            now = time.perf_counter()
            if n_prev_token != n_token:
                token_latency_stats[n_prev_token] = np.round(now - prev, 3)
                n_prev_token = n_token
            prev = now
        # for pytorch engine to restart a session
        if hasattr(chatbot, 'end'):
            chatbot.end(session_id)
        if session_id == 1:
            pbar.update(1)

...@@ -68,12 +68,13 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,

    que.put((session_id, stats))
def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int,
           gen_config: EngineGenerationConfig):
    if not warmup_round:
        return

    print('start to warmup ...')
    output_seqlen = gen_config.max_new_tokens

    def _infer(model, session_id):
        chatbot = model.create_instance()

...@@ -84,15 +85,16 @@ def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,

                                       sequence_start=True,
                                       sequence_end=True,
                                       ignore_eos=True,
                                       gen_config=gen_config):
            continue
        # for pytorch engine to restart a session
        if hasattr(chatbot, 'end'):
            chatbot.end(session_id)

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1), daemon=True)
        procs.append(proc)
        proc.start()

...@@ -104,25 +106,27 @@ def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,
def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
                       engine_config: Union[PytorchEngineConfig,
                                            TurbomindEngineConfig],
                       gen_config: EngineGenerationConfig, test_round: int,
                       warmup_round: int):
    output_seqlen = gen_config.max_new_tokens
    print(f'profiling ... concurrency: {concurrency}, '
          f'n_prompt_token: {input_seqlen}, '
          f'n_completion_token: {output_seqlen}, '
          f'test_round: {test_round}, warmup_round: {warmup_round}')

    if isinstance(engine_config, TurbomindEngineConfig):
        from lmdeploy.turbomind import TurboMind
        tm_model = TurboMind.from_pretrained(model_path,
                                             engine_config=engine_config)
    elif isinstance(engine_config, PytorchEngineConfig):
        from lmdeploy.pytorch.engine import Engine
        tm_model = Engine(model_path, engine_config)

    # make up a dummy `input_ids` with the length of `input_seqlen` exactly
    assert input_seqlen > 0, 'input_seqlen should > 0'
    input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
    warmup(tm_model, concurrency, input_ids, warmup_round, gen_config)

    que = Queue()
    procs = []

...@@ -130,8 +134,8 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,

    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, gen_config, test_round,
                            que))
        procs.append(proc)
        proc.start()

...@@ -186,76 +190,76 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,

        percentiles, throughput, tm_model.gpu_count
class MemoryMonitor:
    from multiprocessing import Manager
    max_mem = Manager().Value('f', 0)  # GB
    device_count = Manager().Value('f', 0)

    @staticmethod
    def nvidia_info():
        # pip install nvidia-ml-py
        nvidia_dict = {
            'state': True,
            'nvidia_version': '',
            'nvidia_count': 0,
            'gpus': []
        }
        try:
            nvmlInit()
            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
            for i in range(nvidia_dict['nvidia_count']):
                handle = nvmlDeviceGetHandleByIndex(i)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                gpu = {
                    'gpu_name': nvmlDeviceGetName(handle),
                    'total': memory_info.total,
                    'free': memory_info.free,
                    'used': memory_info.used,
                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
                    'powerStatus': nvmlDeviceGetPowerState(handle)
                }
                nvidia_dict['gpus'].append(gpu)
        except NVMLError as _:  # noqa
            nvidia_dict['state'] = False
        except Exception as _:  # noqa
            nvidia_dict['state'] = False
        finally:
            try:
                nvmlShutdown()
            except:  # noqa
                pass
        return nvidia_dict

    @classmethod
    def mem_monitor(cls):
        info = cls.nvidia_info()
        max_mem = 0
        mem_start = 0
        cls.device_count.value = len(info['gpus'])
        for used_total in info['gpus']:
            mem_start += used_total['used']
        while True:
            info = cls.nvidia_info()
            used = 0
            for used_total in info['gpus']:
                used += used_total['used']
            if used > max_mem:
                max_mem = used
                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)

    @classmethod
    def start(cls):
        cls._running = True
        from multiprocessing import Process
        cls.proc = Process(target=cls.mem_monitor, daemon=True)
        cls.proc.start()

    @classmethod
    def terminate(cls) -> float:
        """Terminate the subprocess and return maximum memory."""
        cls.proc.kill()
        return cls.max_mem.value
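# An illustrative usage sketch (the call sites live outside this hunk):
# MemoryMonitor is meant to bracket a profiling run so that the peak extra
# GPU memory can be reported afterwards, e.g.
#
#     MemoryMonitor.start()
#     try:
#         ...  # run profile_throughput(...) in a worker here
#     finally:
#         peak_gb = MemoryMonitor.terminate()
#         print(f'peak extra GPU memory: {peak_gb:.2f} GB')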
@dataclass
...@@ -274,66 +278,97 @@ class ProfileResult:
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description='Regression Test') parser = argparse.ArgumentParser(
description='Profile the token generation performance with'
' pytorch or turbomind engine',
formatter_class=DefaultsAndTypesHelpFormatter)
parser.add_argument('model_path', parser.add_argument('model_path',
type=str, type=str,
help='the path of the model in localhost or ' help='the path of the model in localhost or '
'the repo_id of the model in huggingface.co') 'the repo_id of the model in huggingface.co')
parser.add_argument('--concurrency', parser.add_argument('-c',
'--concurrency',
nargs='+', nargs='+',
type=int, type=int,
help='how many requests launched concurrently', help='how many requests launched concurrently',
default=[1, 16, 32, 64]) default=[1, 16, 32, 64])
parser.add_argument( parser.add_argument(
'-pt',
'--prompt-tokens', '--prompt-tokens',
nargs='+', nargs='+',
type=int, type=int,
help='how many requests launched concurrently. One-to-one' help='the number of prompt tokens per request. One-to-one '
'correspondence with completion-tokens', 'correspondence with completion-tokens',
default=[1, 128, 128, 2048, 2048]) default=[1, 128, 128, 2048, 2048])
parser.add_argument('--completion-tokens', parser.add_argument('-ct',
'--completion-tokens',
nargs='+', nargs='+',
type=int, type=int,
help='how many tokens to be generated. One-to-one' help='how many tokens to be generated. One-to-one '
'correspondence with prompt-tokens', 'correspondence with prompt-tokens',
default=[128, 128, 2048, 128, 2048]) default=[128, 128, 2048, 128, 2048])
parser.add_argument('--tp', type=int, help='Tensor parallel', default=1)
parser.add_argument('--top_k',
type=int,
help='The number of highest probability vocabulary '
'tokens to keep for top-k-filtering',
default=1)
parser.add_argument('--top_p',
type=float,
help='the set of most probable tokens with '
'probabilities that add up to top_p or higher '
'are kept for generation',
default=1.0)
parser.add_argument('--temperature',
type=float,
help='The value used to modulate the next token '
'probabilities',
default=1.0)
parser.add_argument('--csv', parser.add_argument('--csv',
type=str, type=str,
help='Where to save the result.', help='Where to save the result.',
default='profile_generation.csv') default='profile_generation.csv')
parser.add_argument('--log-level', parser.add_argument('-tr',
help='set log level', '--test-round',
default='ERROR',
choices=list(logging._nameToLevel.keys()))
parser.add_argument('--test-round',
type=int, type=int,
help='number of test rounds', help='number of test rounds',
default=6) default=3)
parser.add_argument('--warmup-round', parser.add_argument('-w',
'--warmup-round',
type=int, type=int,
help='number of warmuop rounds', help='number of warmup rounds',
default=1) default=1)
# other args
ArgumentHelper.top_p(parser)
ArgumentHelper.temperature(parser)
ArgumentHelper.top_k(parser)
ArgumentHelper.log_level(parser)
ArgumentHelper.backend(parser)
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group, default=2048)
# turbomind engine args
tb_group = parser.add_argument_group('TurboMind engine argument')
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(cache_count_act)
ArgumentHelper.model_format(tb_group, default='hf')
args = parser.parse_args() args = parser.parse_args()
return args return args
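parse_args() above registers --tp, --session-len and --cache-max-entry-count once (through ArgumentHelper) and then appends the same argparse actions to the TurboMind group so they are listed under both engines in --help. A minimal standard-library sketch of that trick, with an illustrative --tp flag:

# Sketch of sharing one argparse action between two help groups, as done
# above with the tp / session_len / cache_max_entry_count actions.
import argparse

parser = argparse.ArgumentParser(description='shared-action demo')

pt_group = parser.add_argument_group('PyTorch engine arguments')
tp_act = pt_group.add_argument('--tp', type=int, default=1,
                               help='tensor parallel degree (illustrative)')

tb_group = parser.add_argument_group('TurboMind engine argument')
# _group_actions is a private argparse attribute; re-listing the existing
# action here only changes --help output, parsing still happens once.
tb_group._group_actions.append(tp_act)

args = parser.parse_args(['--tp', '2'])
print(args.tp)  # -> 2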
def __proc_cb(*args, ret_pipe, target):
try:
ret = target(*args)
ret_pipe[1].send(ret)
except Exception as e:
ret_pipe[1].send(e)
def _process_map(target, iterable):
from multiprocessing import Pipe, get_context
pipe = Pipe(False)
spawn_context = get_context('spawn')
proc = spawn_context.Process(target=__proc_cb,
args=iterable,
kwargs=dict(ret_pipe=pipe, target=target))
proc.start()
proc.join()
ret = pipe[0].recv()
if isinstance(ret, Exception):
raise ret
return ret
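_process_map above replaces the old Pool(1).map call: it runs the profiling target in a freshly spawned process and ships the return value, or any raised exception, back to the parent over a one-way Pipe. A self-contained sketch of the same pattern with an illustrative worker function:

# Sketch of the run-in-a-spawned-process pattern used by _process_map above.
# `square` is an illustrative worker, not part of lmdeploy.
from multiprocessing import Pipe, get_context


def _worker(*args, ret_pipe, target):
    try:
        ret_pipe[1].send(target(*args))
    except Exception as e:          # ship the exception to the parent instead
        ret_pipe[1].send(e)


def run_in_subprocess(target, args):
    pipe = Pipe(False)              # one-way: child writes, parent reads
    ctx = get_context('spawn')      # fresh interpreter, no inherited CUDA state
    proc = ctx.Process(target=_worker, args=args,
                       kwargs=dict(ret_pipe=pipe, target=target))
    proc.start()
    proc.join()
    ret = pipe[0].recv()
    if isinstance(ret, Exception):
        raise ret
    return ret


def square(x):
    return x * x


if __name__ == '__main__':
    print(run_in_subprocess(square, (8,)))   # -> 64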
def main(): def main():
args = parse_args() args = parse_args()
assert len(args.prompt_tokens) == len(args.completion_tokens), \ assert len(args.prompt_tokens) == len(args.completion_tokens), \
...@@ -342,30 +377,49 @@ def main(): ...@@ -342,30 +377,49 @@ def main():
os.environ['TM_LOG_LEVEL'] = args.log_level os.environ['TM_LOG_LEVEL'] = args.log_level
results: List[ProfileResult] = [] results: List[ProfileResult] = []
for batch in args.concurrency: for batch in args.concurrency:
for prompt_tokens, completion_tokens in zip(args.prompt_tokens, for prompt_tokens, completion_tokens in zip(args.prompt_tokens,
args.completion_tokens): args.completion_tokens):
# MemoryMonitor.start() MemoryMonitor.start()
from functools import partial from functools import partial
from multiprocessing import Pool
profile_target = partial(profile_throughput, # make sure session_len >= prompt_tokens + completion_tokens
concurrency=batch, session_len = max(args.session_len,
input_seqlen=prompt_tokens, prompt_tokens + completion_tokens)
output_seqlen=completion_tokens, if args.backend == 'turbomind':
engine_config = TurbomindEngineConfig(
cache_max_entry_count=args.cache_max_entry_count,
model_format=args.model_format,
session_len=session_len,
tp=args.tp)
elif args.backend == 'pytorch':
engine_config = PytorchEngineConfig(
cache_max_entry_count=args.cache_max_entry_count,
session_len=session_len,
tp=args.tp, tp=args.tp,
thread_safe=True)
gen_config = EngineGenerationConfig(
top_k=args.top_k, top_k=args.top_k,
top_p=args.top_p, top_p=args.top_p,
temperature=args.temperature, temperature=args.temperature,
max_new_tokens=completion_tokens,
ignore_eos=True)
profile_target = partial(
profile_throughput,
concurrency=batch,
input_seqlen=prompt_tokens,
engine_config=engine_config,
gen_config=gen_config,
test_round=args.test_round, test_round=args.test_round,
warmup_round=args.warmup_round) warmup_round=args.warmup_round,
output = Pool(1).map(profile_target, (args.model_path, )) )
output = _process_map(profile_target, (args.model_path, ))
model_name, first_token_latency, percentiles, \ model_name, first_token_latency, percentiles, \
throughput_per_proc, tp = output[0] throughput_per_proc, tp = output
time.sleep(5) # wait a while for releasing GPU mem time.sleep(5) # wait a while for releasing GPU mem
# memory = MemoryMonitor.terminate() memory = MemoryMonitor.terminate()
# device_count = MemoryMonitor.device_count.value device_count = MemoryMonitor.device_count.value
memory=0
device_count=0
results.append( results.append(
ProfileResult(model_name=model_name, ProfileResult(model_name=model_name,
batch=batch, batch=batch,
......
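main() now picks an engine configuration per backend and bundles the sampling parameters into an EngineGenerationConfig before each measurement. The sketch below restates that selection using only the fields visible in this diff; the exact lmdeploy signatures are assumed from this commit rather than from a stable API.

# Sketch of the backend/config selection done in main() above. Field names
# are taken from the diff; the lmdeploy signatures are assumed, not verified.
from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
                               TurbomindEngineConfig)


def build_configs(backend, prompt_tokens, completion_tokens, args):
    # the session must hold the prompt plus everything we intend to generate
    session_len = max(args.session_len, prompt_tokens + completion_tokens)
    if backend == 'turbomind':
        engine_config = TurbomindEngineConfig(
            session_len=session_len,
            cache_max_entry_count=args.cache_max_entry_count,
            model_format=args.model_format,
            tp=args.tp)
    else:  # 'pytorch'
        engine_config = PytorchEngineConfig(
            session_len=session_len,
            cache_max_entry_count=args.cache_max_entry_count,
            tp=args.tp,
            thread_safe=True)
    gen_config = EngineGenerationConfig(top_k=args.top_k,
                                        top_p=args.top_p,
                                        temperature=args.temperature,
                                        max_new_tokens=completion_tokens,
                                        ignore_eos=True)
    return engine_config, gen_config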
...@@ -75,8 +75,9 @@ import torch ...@@ -75,8 +75,9 @@ import torch
from transformers import AutoModelForCausalLM, GenerationConfig from transformers import AutoModelForCausalLM, GenerationConfig
from lmdeploy.pytorch.accel import LoadNoInit from lmdeploy.pytorch.accel import LoadNoInit
from lmdeploy.utils import get_logger
logger = logging.getLogger(__file__) logger = get_logger(__file__)
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
info = logger.info info = logger.info
warning = logger.warning warning = logger.warning
......
...@@ -28,6 +28,9 @@ def sample_requests( ...@@ -28,6 +28,9 @@ def sample_requests(
dataset = [(data['conversations'][0]['value'], dataset = [(data['conversations'][0]['value'],
data['conversations'][1]['value']) for data in dataset] data['conversations'][1]['value']) for data in dataset]
# pre-sample to avoid going through the whole dataset
dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000))
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset] prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids prompt_token_ids = tokenizer(prompts).input_ids
...@@ -204,8 +207,8 @@ class Engine: ...@@ -204,8 +207,8 @@ class Engine:
def main(server_addr: str, def main(server_addr: str,
tokenizer_path: str, tokenizer_path: str,
dataset: str, dataset: str,
concurrency: int = 64, concurrency: int = 128,
num_prompts: int = 2000, num_prompts: int = 5000,
top_p: float = 1.0, top_p: float = 1.0,
temperature: float = 1.0, temperature: float = 1.0,
stream_output: bool = False, stream_output: bool = False,
...@@ -218,8 +221,8 @@ def main(server_addr: str, ...@@ -218,8 +221,8 @@ def main(server_addr: str,
tokenizer_path (str): Path to the tokenizer model in localhost tokenizer_path (str): Path to the tokenizer model in localhost
dataset (str): Path to the dataset dataset (str): Path to the dataset
concurrency (int, optional): Number of working threads to process the sampled prompts. concurrency (int, optional): Number of working threads to process the sampled prompts.
Defaults to 64. Defaults to 128.
num_prompts (int, optional): Number of prompts to process. Defaults to 2000. num_prompts (int, optional): Number of prompts to process. Defaults to 5000.
top_p (float, optional): the set of most probable tokens with top_p (float, optional): the set of most probable tokens with
probabilities that add up to top_p or higher probabilities that add up to top_p or higher
are kept for generation. Defaults to 1.0. are kept for generation. Defaults to 1.0.
......
...@@ -28,6 +28,9 @@ def sample_requests( ...@@ -28,6 +28,9 @@ def sample_requests(
dataset = [(data['conversations'][0]['value'], dataset = [(data['conversations'][0]['value'],
data['conversations'][1]['value']) for data in dataset] data['conversations'][1]['value']) for data in dataset]
# pre-sample to avoid going through the whole dataset
dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000))
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset] prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids prompt_token_ids = tokenizer(prompts).input_ids
...@@ -80,7 +83,6 @@ class Engine: ...@@ -80,7 +83,6 @@ class Engine:
chatbot = Chatbot(self.server_addr, chatbot = Chatbot(self.server_addr,
ignore_eos=True, ignore_eos=True,
profile_serving=True,
top_k=self.top_k, top_k=self.top_k,
top_p=self.top_p, top_p=self.top_p,
temperature=self.temperature, temperature=self.temperature,
...@@ -150,6 +152,7 @@ class Engine: ...@@ -150,6 +152,7 @@ class Engine:
session_id, _stats = res_queue.get() session_id, _stats = res_queue.get()
# print(f'\n{"-" * 50}\n' # print(f'\n{"-" * 50}\n'
# f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n') # f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
if len(_stats) != 0:
stats.append(np.array(_stats)) stats.append(np.array(_stats))
stats = np.concatenate(stats).reshape(-1, 5) stats = np.concatenate(stats).reshape(-1, 5)
......
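The `if len(_stats) != 0` guard added above matters because a session that produced no samples becomes a 1-D empty array and breaks the later concatenate/reshape. A small numpy illustration:

# Why the `if len(_stats) != 0` guard above is needed: an empty per-session
# list would turn into a shape-(0,) array and break np.concatenate(...).reshape(-1, 5).
import numpy as np

per_session = [[(1, 2, 3, 4, 5), (6, 7, 8, 9, 10)], [], [(11, 12, 13, 14, 15)]]

stats = []
for _stats in per_session:
    if len(_stats) != 0:            # skip sessions that produced nothing
        stats.append(np.array(_stats))

stats = np.concatenate(stats).reshape(-1, 5)
print(stats.shape)                  # -> (3, 5)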
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import argparse
import csv import csv
import json import json
import os import os
...@@ -6,14 +7,16 @@ import random ...@@ -6,14 +7,16 @@ import random
import time import time
from queue import Queue from queue import Queue
from threading import Thread from threading import Thread
from typing import List, Tuple from typing import List, Tuple, Union
import fire
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from lmdeploy.tokenizer import Tokenizer from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
from lmdeploy.turbomind import TurboMind from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
TurbomindEngineConfig)
from lmdeploy.pytorch.engine.engine import EngineInstance
from lmdeploy.tokenizer import DetokenizeState, Tokenizer
def sample_requests( def sample_requests(
...@@ -30,6 +33,9 @@ def sample_requests( ...@@ -30,6 +33,9 @@ def sample_requests(
dataset = [(data['conversations'][0]['value'], dataset = [(data['conversations'][0]['value'],
data['conversations'][1]['value']) for data in dataset] data['conversations'][1]['value']) for data in dataset]
# pre-sample to avoid going through the whole dataset
dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000))
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset] prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids prompt_token_ids = tokenizer(prompts).input_ids
...@@ -59,19 +65,25 @@ def sample_requests( ...@@ -59,19 +65,25 @@ def sample_requests(
class Engine: class Engine:
def __init__(self, model_path: str, tp: int, csv: str, **kwargs): def __init__(self, model_path: str,
# avoid turbomind checking chat template name by setting engine_config: Union[PytorchEngineConfig,
# `model_name='llama'` TurbomindEngineConfig], csv: str):
tm_model = TurboMind(model_path=model_path, if isinstance(engine_config, TurbomindEngineConfig):
model_name='llama', from lmdeploy.turbomind import TurboMind
tp=tp, tm_model = TurboMind.from_pretrained(model_path,
**kwargs) engine_config=engine_config)
elif isinstance(engine_config, PytorchEngineConfig):
from lmdeploy.pytorch.engine import Engine as PytorchEngine
tm_model = PytorchEngine(model_path, engine_config=engine_config)
self.tm_model = tm_model self.tm_model = tm_model
self.tokenizer = tm_model.tokenizer self.tokenizer = tm_model.tokenizer
self.csv = csv self.csv = csv
self.pbar = None self.pbar = None
def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
temperature: float, top_p: float, top_k: int,
stream_output: bool): stream_output: bool):
model_inst = self.tm_model.create_instance() model_inst = self.tm_model.create_instance()
stats = [] stats = []
...@@ -80,31 +92,35 @@ class Engine: ...@@ -80,31 +92,35 @@ class Engine:
for prompt, input_seqlen, output_seqlen in iter( for prompt, input_seqlen, output_seqlen in iter(
req_queue.get, [None, None, None]): req_queue.get, [None, None, None]):
_per_token_latency_stats = [0] * (output_seqlen + 1) _per_token_latency_stats = [0] * (output_seqlen + 1)
offset = 0 state = DetokenizeState()
prev = time.perf_counter() prev = time.perf_counter()
n_prev_token = 0 n_prev_token = 0
input_ids = self.tokenizer(prompt).input_ids input_ids = self.tokenizer(prompt).input_ids
for outputs in model_inst.stream_infer( for outputs in model_inst.stream_infer(
session_id, session_id,
input_ids=input_ids, input_ids=input_ids,
request_output_len=output_seqlen, gen_config=EngineGenerationConfig(
temperature=1.0, max_new_tokens=output_seqlen,
top_p=1.0, temperature=temperature,
top_p=top_p,
top_k=top_k,
ignore_eos=True),
sequence_start=True, sequence_start=True,
sequence_end=True, sequence_end=True,
ignore_eos=True,
stream_output=stream_output): stream_output=stream_output):
res, n_token = outputs[0] _, res, n_token = outputs
self.tokenizer.decode(res, offset) _, state = self.tokenizer.detokenize_incrementally(res, state)
offset = n_token
now = time.perf_counter() now = time.perf_counter()
if n_prev_token != n_token: if n_prev_token != n_token:
_per_token_latency_stats[n_prev_token] = np.round( _per_token_latency_stats[n_prev_token] = np.round(
now - prev, 3) now - prev, 3)
n_prev_token = n_token n_prev_token = n_token
prev = now prev = now
# for pytorch engine to restart a session
if isinstance(model_inst, EngineInstance):
model_inst.end(session_id)
assert output_seqlen <= n_token <= output_seqlen + 1, \ assert output_seqlen <= n_token <= output_seqlen + 1, \
f'Error. session_id({session_id}) request {output_seqlen} ' \ f'Error. session_id({session_id}) request {output_seqlen} ' \
f'tokens, but generate {n_token} tokens.\n' \ f'tokens, but generate {n_token} tokens.\n' \
...@@ -122,10 +138,8 @@ class Engine: ...@@ -122,10 +138,8 @@ class Engine:
self.pbar.update(1) self.pbar.update(1)
res_queue.put((session_id, stats, per_token_latency_stats)) res_queue.put((session_id, stats, per_token_latency_stats))
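The streaming loop above timestamps every stream_infer step but records a latency sample only when the token count actually advances, so _per_token_latency_stats stays aligned with generated tokens rather than with stream callbacks. A minimal sketch of that bookkeeping against a fake token stream (the generator below is illustrative, standing in for model_inst.stream_infer):

# Sketch of the per-token latency bookkeeping in _inference above, driven by
# a fake token stream instead of a real engine.
import time

import numpy as np


def fake_stream(output_seqlen):
    """Illustrative stand-in for stream_infer: yields a growing token count."""
    for n_token in range(1, output_seqlen + 1):
        time.sleep(0.01)            # pretend the model spent time decoding
        yield n_token


output_seqlen = 8
per_token_latency = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_prev_token = 0

for n_token in fake_stream(output_seqlen):
    now = time.perf_counter()
    if n_prev_token != n_token:     # only record when new tokens arrived
        per_token_latency[n_prev_token] = np.round(now - prev, 3)
        n_prev_token = n_token
        prev = now

print(per_token_latency)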
def process_request(self, def process_request(self, requests, concurrency, temperature, top_p, top_k,
requests, stream_output):
concurrency: int = 1,
stream_output: bool = True):
res_queue = Queue() res_queue = Queue()
req_queue = Queue() req_queue = Queue()
threads = [] threads = []
...@@ -143,7 +157,9 @@ class Engine: ...@@ -143,7 +157,9 @@ class Engine:
# start threads # start threads
for i in range(concurrency): for i in range(concurrency):
t = Thread(target=self._inference, t = Thread(target=self._inference,
args=(req_queue, res_queue, i, stream_output)) args=(req_queue, res_queue, i, temperature, top_p,
top_k, stream_output),
daemon=True)
t.start() t.start()
threads.append(t) threads.append(t)
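process_request() above fans the sampled requests out to daemon worker threads through a Queue and stops each worker with a [None, None, None] sentinel consumed via iter(req_queue.get, ...). A self-contained sketch of that fan-out; the worker body is illustrative, where lmdeploy's worker calls stream_infer:

# Sketch of the queue-plus-sentinel fan-out used by process_request above.
from queue import Queue
from threading import Thread

SENTINEL = [None, None, None]


def worker(req_queue: Queue, res_queue: Queue, worker_id: int):
    results = []
    # iter(get, SENTINEL) keeps pulling requests until the sentinel arrives
    for prompt, input_len, output_len in iter(req_queue.get, SENTINEL):
        results.append((prompt, input_len + output_len))
    res_queue.put((worker_id, results))


req_queue, res_queue = Queue(), Queue()
requests = [('hi', 1, 8), ('hello', 2, 8), ('hey', 1, 16)]
concurrency = 2

for req in requests:
    req_queue.put(req)
for _ in range(concurrency):        # one sentinel per worker thread
    req_queue.put(SENTINEL)

threads = [Thread(target=worker, args=(req_queue, res_queue, i), daemon=True)
           for i in range(concurrency)]
for t in threads:
    t.start()
for t in threads:
    t.join()

while not res_queue.empty():
    print(res_queue.get())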
...@@ -225,53 +241,90 @@ class Engine: ...@@ -225,53 +241,90 @@ class Engine:
]) ])
def main(dataset: str, def parse_args():
model_path: str, parser = argparse.ArgumentParser(
concurrency: int = 64, description='Benchmark the request throughput of lmdeploy '
num_prompts: int = 2000, 'in localhost',
tp: int = 1, formatter_class=DefaultsAndTypesHelpFormatter)
top_k: int = 1, parser.add_argument('dataset', type=str, help='the path of the dataset')
top_p: float = 1.0, parser.add_argument('model_path',
temperature: float = 1.0, type=str,
stream_output: bool = True, help='the path of the model in localhost or '
csv: str = './profile_throughput.csv', 'the repo_id of the model in huggingface.co')
log_level: str = 'ERROR', parser.add_argument(
seed: int = 0): '-c',
"""Benchmark the request throughput of lmdeploy in localhost. '--concurrency',
type=int,
Args: help='Number of working threads to process the sampled prompts',
dataset (str): Path to the dataset default=256)
model_path (str): Path to a model in localhost or a model_repo_id in huggingface.co parser.add_argument('-n',
concurrency (int, optional): Number of working threads to process the sampled prompts. '--num-prompts',
Defaults to 64. type=int,
num_prompts (int, optional): Number of prompts to process. Defaults to 2000. help='Number of prompts to process',
tp (int, optional): Number of GPUs for tensor parallel. Defaults to 1. default=5000)
top_k (int, optional): The number of highest probability vocabulary tokens parser.add_argument('--csv',
to keep for top-k-filtering. Defaults to 1. type=str,
top_p (float, optional): the set of most probable tokens with help='Where to save the result.',
probabilities that add up to top_p or higher default='./profile_throughput.csv')
are kept for generation. Defaults to 1.0. parser.add_argument('--seed',
temperature (float, optional): The value used to modulate the next token probabilities. type=int,
Defaults to 1.0. default=0,
stream_output (bool, optional): Indicator for streaming output. Defaults to True. help='Seed used in sampling prompts from dataset')
csv (str, optional): The path to save the result. # other args
log_level(str, optional): The log level. Defaults to INFO ArgumentHelper.top_p(parser)
seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0. ArgumentHelper.temperature(parser)
""" # noqa ArgumentHelper.top_k(parser)
random.seed(seed) ArgumentHelper.log_level(parser)
os.environ['TM_LOG_LEVEL'] = log_level ArgumentHelper.backend(parser)
engine = Engine(model_path, # pytorch engine args
tp=tp, pt_group = parser.add_argument_group('PyTorch engine arguments')
top_k=top_k, tp_act = ArgumentHelper.tp(pt_group)
top_p=top_p, session_len_act = ArgumentHelper.session_len(pt_group, default=4096)
temperature=temperature, cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
csv=csv)
# turbomind engine args
requests = sample_requests(dataset, num_prompts, engine.tokenizer) tb_group = parser.add_argument_group('TurboMind engine argument')
tb_group._group_actions.append(tp_act)
engine.process_request(requests, concurrency, stream_output) tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(cache_count_act)
ArgumentHelper.model_format(tb_group, default='hf')
args = parser.parse_args()
return args
def main():
args = parse_args()
random.seed(args.seed)
os.environ['TM_LOG_LEVEL'] = args.log_level
if args.backend == 'turbomind':
engine_config = TurbomindEngineConfig(
session_len=args.session_len,
max_batch_size=args.concurrency,
tp=args.tp,
cache_max_entry_count=args.cache_max_entry_count,
model_format=args.model_format)
elif args.backend == 'pytorch':
engine_config = PytorchEngineConfig(
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
max_batch_size=args.concurrency,
tp=args.tp,
thread_safe=True)
engine = Engine(args.model_path, engine_config, csv=args.csv)
requests = sample_requests(args.dataset, args.num_prompts,
engine.tokenizer)
engine.process_request(requests,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
concurrency=args.concurrency,
stream_output=True)
if __name__ == '__main__': if __name__ == '__main__':
fire.Fire(main) main()
cmake .. -A x64 -T v142,cuda="$env:CUDA_PATH" ` cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" `
-DCMAKE_BUILD_TYPE=Release ` -DCMAKE_BUILD_TYPE=Release `
-DCMAKE_INSTALL_PREFIX=install ` -DCMAKE_INSTALL_PREFIX=install `
-DBUILD_PY_FFI=ON ` -DBUILD_PY_FFI=ON `
......
FROM nvcr.io/nvidia/tritonserver:22.12-py3 FROM nvcr.io/nvidia/tritonserver:22.12-py3
RUN rm /etc/apt/sources.list.d/cuda*.list && apt-get update && apt-get install -y --no-install-recommends \ RUN rm /etc/apt/sources.list.d/cuda*.list && apt-get update && apt-get install -y --no-install-recommends \
rapidjson-dev libgoogle-glog-dev gdb \ rapidjson-dev libgoogle-glog-dev gdb python3.8-venv \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py38
RUN python3 -m pip install --no-cache-dir torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 ENV PATH=/opt/py38/bin:$PATH
RUN python3 -m pip install --no-cache-dir cmake packaging
RUN python3 -m pip install --no-cache-dir --upgrade pip &&\
python3 -m pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118 &&\
python3 -m pip install --no-cache-dir cmake packaging wheel
ENV NCCL_LAUNCH_MODE=GROUP ENV NCCL_LAUNCH_MODE=GROUP
...@@ -29,7 +32,7 @@ RUN cd /opt/lmdeploy &&\ ...@@ -29,7 +32,7 @@ RUN cd /opt/lmdeploy &&\
-DUSE_NVTX=ON &&\ -DUSE_NVTX=ON &&\
make -j$(nproc) && make install &&\ make -j$(nproc) && make install &&\
cd .. &&\ cd .. &&\
python3 -m pip install . &&\ python3 -m pip install -e . &&\
rm -rf build rm -rf build
ENV LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH
.header-logo { .header-logo {
background-image: url("../image/lmdeploy-logo.png"); background-image: url("../image/lmdeploy-logo.svg");
background-size: 150px 60px; background-size: 257px 60px;
height: 60px; height: 60px;
width: 150px; width: 257px;
}
@media screen and (min-width: 1100px) {
.header-logo {
top: -15px;
}
}
pre {
white-space: pre;
}
@media screen and (min-width: 2000px) {
.pytorch-content-left {
width: 1200px;
margin-left: 30px;
}
article.pytorch-article {
max-width: 1200px;
}
.pytorch-breadcrumbs-wrapper {
width: 1200px;
}
.pytorch-right-menu.scrolling-fixed {
position: fixed;
top: 45px;
left: 1580px;
}
}
article.pytorch-article section code {
padding: .2em .4em;
background-color: #f3f4f7;
border-radius: 5px;
}
/* Disable the change in tables */
article.pytorch-article section table code {
padding: unset;
background-color: unset;
border-radius: unset;
}
table.autosummary td {
width: 50%
}
img.align-center {
display: block;
margin-left: auto;
margin-right: auto;
}
article.pytorch-article p.rubric {
font-weight: bold;
} }
resources/lmdeploy-logo.png
\ No newline at end of file
lmdeploy.lite
-------------
.. automodule:: lmdeploy.lite
:members:
lmdeploy.pytorch
----------------
.. automodule:: lmdeploy.pytorch
:members:
lmdeploy.serve
--------------
.. automodule:: lmdeploy.serve
:members: