Commit d7117b95 authored by zhouxiang

Sync the 0.2.6 code

parent 5f83e392
...@@ -9,6 +9,7 @@ __pycache__/

# Distribution / packaging
.Python
triton-rerope/
develop-eggs/
dist/
downloads/

...@@ -61,6 +62,7 @@ work_dir*/

!lmdeploy/turbomind/hf_repo/config.json

# Pytorch
*.pt
*.pth
*.py~
*.sh~

...@@ -72,6 +74,8 @@ work_dir*/

*.log
*.out
*.csv
!start_ids.csv
*.pkl
!CMakeLists.txt
proxy_config.yml
...@@ -3,7 +3,7 @@ repos:

    rev: 4.0.1
    hooks:
      - id: flake8
        args: ["--exclude=lmdeploy/turbomind/triton_models/*,lmdeploy/pytorch/modeling/*", "--max-line-length=79"]
  - repo: https://github.com/PyCQA/isort
    rev: 5.11.5
    hooks:

...@@ -12,6 +12,12 @@ repos:

    rev: v0.32.0
    hooks:
      - id: yapf
        name: yapf
        description: 'Formatter for Python code'
        entry: yapf
        language: python
        args: ['-i', '--style={based_on_style: pep8, column_limit: 79}']
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.2.0
    hooks:
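# Note: the hooks above can be exercised locally with `pre-commit install`
# (once) followed by `pre-commit run --all-files`, assuming the pre-commit
# package itself is installed.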
......
version: 2

formats: all

build:
  os: "ubuntu-22.04"
  tools:
    python: "3.8"

python:
  install:
    - requirements: requirements/docs.txt
    - requirements: requirements/readthedocs.txt
...@@ -291,10 +291,33 @@ print(torch._C._GLIBCXX_USE_CXX11_ABI,end='');"

endif()
endif()
# turn off warnings on windows
if (MSVC)
  foreach(
    flag_var
    CMAKE_CXX_FLAGS
    CMAKE_CXX_FLAGS_DEBUG
    CMAKE_CXX_FLAGS_RELEASE
    CMAKE_CXX_FLAGS_MINSIZEREL
    CMAKE_CXX_FLAGS_RELWITHDEBINFO
    CMAKE_C_FLAGS
    CMAKE_C_FLAGS_DEBUG
    CMAKE_C_FLAGS_RELEASE
    CMAKE_C_FLAGS_MINSIZEREL
    CMAKE_C_FLAGS_RELWITHDEBINFO
    CMAKE_CUDA_FLAGS
    CMAKE_CUDA_FLAGS_DEBUG
    CMAKE_CUDA_FLAGS_RELEASE
    CMAKE_CUDA_FLAGS_MINSIZEREL
    CMAKE_CUDA_FLAGS_RELWITHDEBINFO)
    string(REGEX REPLACE "-Wall" " /W0 " ${flag_var} "${${flag_var}}")
  endforeach()
endif()
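# (The foreach/REGEX REPLACE above swaps any "-Wall" in the C, C++ and CUDA
# flag variables for "/W0", so MSVC builds compile without warnings.)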
if (BUILD_MULTI_GPU)
  list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
  #list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
  # list(APPEND COMMON_LIB_DIRS /opt/mpi/lib)
endif()

if(USE_TRITONSERVER_DATATYPE)

...@@ -376,7 +399,8 @@ add_library(transformer-shared SHARED

if (BUILD_MULTI_GPU)
  target_link_libraries(transformer-shared PUBLIC
    #-lmpi
    ${MPI_CXX_LIBRARIES}
    ${NCCL_LIBRARIES}
  )
endif()
......
<div align="center"> <div align="center">
<img src="resources/lmdeploy-logo.svg" width="450"/> <img src="docs/en/_static/image/lmdeploy-logo.svg" width="450"/>
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy.readthedocs.io/en/latest/)
[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
[![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy) [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy)
[![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE) [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[📘Documentation](https://lmdeploy.readthedocs.io/en/latest/) |
[🛠️Quick Start](https://lmdeploy.readthedocs.io/en/latest/get_started.html) |
[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose)
English | [简体中文](README_zh-CN.md) English | [简体中文](README_zh-CN.md)
</div> 👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d)
<p align="center"> </div>
👋 join us on <a href="https://twitter.com/intern_lm" target="_blank">Twitter</a>, <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________ ______________________________________________________________________
## Latest News 🎉

<details open>
<summary><b>2024</b></summary>

- \[2024/03\] Support VLM offline inference pipeline and serving.
- \[2024/02\] Support Qwen 1.5, Gemma, Mistral, Mixtral, Deepseek-MOE and so on.
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) seamless integration with [LMDeploy Serving Service](./docs/en/serving/api_server.md).
- \[2024/01\] Support for multi-model, multi-machine, multi-card inference services. For usage instructions, please refer to [here](./docs/en/serving/proxy_server.md)
- \[2024/01\] Support [PyTorch inference engine](./docs/en/inference/pytorch.md), developed entirely in Python, helping to lower the barriers for developers and enable rapid experimentation with new features and technologies.

</details>

<details close>
<summary><b>2023</b></summary>

- \[2023/12\] Turbomind supports multimodal input. [Gradio Demo](./examples/vl/README.md)
- \[2023/11\] Turbomind supports loading hf model directly. Click [here](docs/en/inference/load_hf.md) for details.
- \[2023/11\] TurboMind major upgrades, including: Paged Attention, faster attention kernels without sequence length limitation, 2x faster KV8 kernels, Split-K decoding (Flash Decoding), and W4A16 inference for sm_75
- \[2023/09\] TurboMind supports Qwen-14B
- \[2023/09\] TurboMind supports InternLM-20B
...@@ -29,75 +47,71 @@ ______________________________________________________________________
- \[2023/08\] TurboMind supports flash-attention2.
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
- \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation. Check [this](docs/en/quantization/w4a16.md) guide for detailed info
- \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
- \[2023/08\] LMDeploy supports 4-bit quantization using the [AWQ](https://arxiv.org/abs/2306.00978) algorithm.
- \[2023/07\] TurboMind supports Llama-2 70B with GQA.
- \[2023/07\] TurboMind supports Llama-2 7B/13B.
- \[2023/07\] TurboMind supports tensor-parallel inference of InternLM.

</details>

______________________________________________________________________
# Introduction

LMDeploy is a toolkit for compressing, deploying, and serving LLMs, developed by the [MMRazor](https://github.com/open-mmlab/mmrazor) and [MMDeploy](https://github.com/open-mmlab/mmdeploy) teams. It has the following core features:

- **Efficient Inference**: LMDeploy delivers up to 1.8x higher request throughput than vLLM, by introducing key features like persistent batch (a.k.a. continuous batching), blocked KV cache, dynamic split & fuse, tensor parallelism, and high-performance CUDA kernels.

- **Effective Quantization**: LMDeploy supports weight-only and k/v quantization, and the 4-bit inference performance is 2.4x higher than FP16. The quantization quality has been confirmed via OpenCompass evaluation.

- **Effortless Distribution Server**: Leveraging the request distribution service, LMDeploy makes it easy to deploy multi-model services across multiple machines and GPUs.

- **Interactive Inference Mode**: By caching the k/v of attention during multi-round dialogue processes, the engine remembers dialogue history, thus avoiding repetitive processing of historical sessions.
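The features above are surfaced through the `pipeline` API. Below is a minimal sketch of how they are typically configured, assuming the `TurbomindEngineConfig` and `GenerationConfig` objects exported by the 0.2.x releases; field names such as `cache_max_entry_count` (the share of GPU memory reserved for the blocked KV cache) follow that assumption and may differ between versions.

```python
from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

# Engine-level knobs: tensor parallelism and the share of GPU memory
# reserved for the blocked KV cache.
engine_cfg = TurbomindEngineConfig(tp=1, cache_max_entry_count=0.8)

# Per-request sampling parameters.
gen_cfg = GenerationConfig(max_new_tokens=256, top_p=0.8, temperature=0.7)

pipe = pipeline('internlm/internlm-chat-7b', backend_config=engine_cfg)
responses = pipe(['Hi, pls intro yourself', 'Shanghai is'], gen_config=gen_cfg)
print(responses)
```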
# Performance

![v0 1 0-benchmark](https://github.com/InternLM/lmdeploy/assets/4560679/8e455cf1-a792-4fa8-91a2-75df96a2a5ba)

For detailed inference benchmarks on more devices and with more settings, please refer to the following links:

- [A100](./docs/en/benchmark/a100_fp16.md)
- V100
- 4090
- 3090
- 2080

# Supported Models

| Model | Size |
| :----------------: | :--------: |
| Llama | 7B - 65B |
| Llama2 | 7B - 70B |
| InternLM | 7B - 20B |
| InternLM2 | 7B - 20B |
| InternLM-XComposer | 7B |
| QWen | 7B - 72B |
| QWen1.5 | 0.5B - 72B |
| QWen-VL | 7B |
| Baichuan | 7B - 13B |
| Baichuan2 | 7B - 13B |
| Code Llama | 7B - 34B |
| ChatGLM2 | 6B |
| Falcon | 7B - 180B |
| YI | 6B - 34B |
| Mistral | 7B |
| DeepSeek-MoE | 16B |
| Mixtral | 8x7B |
| Gemma | 2B - 7B |
LMDeploy has developed two inference engines: [TurboMind](./docs/en/inference/turbomind.md) and [PyTorch](./docs/en/inference/pytorch.md), each with a different focus. The former strives for ultimate optimization of inference performance, while the latter, developed purely in Python, aims to lower the barrier for developers.

They differ in the types of supported models and the inference data types. Please refer to [this table](./docs/en/supported_models/supported_models.md) for each engine's capability and choose the one that best fits your actual needs.
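As an illustration of switching between the two engines, the sketch below assumes that `lmdeploy.pipeline` accepts either engine's config object through `backend_config`, as in the 0.2.x API; the `use_turbomind` switch is purely hypothetical.

```python
from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline

use_turbomind = True  # hypothetical switch, just for this sketch

# TurboMind targets peak inference performance; the PyTorch engine trades
# some speed for a pure-Python code base that is easier to extend.
backend_config = (TurbomindEngineConfig(tp=1)
                  if use_turbomind else PytorchEngineConfig(tp=1))

pipe = pipeline('internlm/internlm-chat-7b', backend_config=backend_config)
print(pipe(['Hello, please introduce yourself.']))
```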
# Quick Start

## Installation

Install lmdeploy with pip (python 3.8+) or [from source](./docs/en/build.md)

...@@ -105,121 +119,54 @@ Install lmdeploy with pip ( python 3.8+) or [from source](./docs/en/build.md)

pip install lmdeploy
```

The default prebuilt package is compiled on CUDA 11.8. However, if CUDA 12+ is required, you can install lmdeploy by:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```
## Offline Batch Inference

```python
import lmdeploy
pipe = lmdeploy.pipeline("internlm/internlm-chat-7b")
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
```

> \[!NOTE\]
> By default, LMDeploy downloads models from HuggingFace. If you would like to use models from ModelScope, please install ModelScope by `pip install modelscope` and set the environment variable:
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
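Beyond the offline pipeline, the same model can be served and queried over an OpenAI-compatible REST API. The snippet below is a sketch only: it assumes an `lmdeploy serve api_server` instance is already listening on `http://localhost:23333` (the default address used in the serving docs) and that the served model is registered under the name `internlm-chat-7b`.

```python
# Query a running `lmdeploy serve api_server` via its OpenAI-compatible route.
import requests

payload = {
    'model': 'internlm-chat-7b',  # assumed to match the served model name
    'messages': [{'role': 'user', 'content': 'Hi, pls intro yourself'}],
    'temperature': 0.8,
}
resp = requests.post('http://localhost:23333/v1/chat/completions',
                     json=payload,
                     timeout=60)
print(resp.json()['choices'][0]['message']['content'])
```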
For more information about the inference pipeline, please refer to [here](./docs/en/inference/pipeline.md).

# Tutorials

Please read the [getting_started](./docs/en/get_started.md) section for the basic usage of LMDeploy.

For detailed user guides and advanced guides, please refer to our [tutorials](https://lmdeploy.readthedocs.io/en/latest/):

- User Guide
  - [LLM Inference pipeline](./docs/en/inference/pipeline.md)
  - [VLM Inference pipeline](./docs/en/inference/vl_pipeline.md)
  - [LLM Serving](docs/en/serving/api_server.md)
  - [VLM Serving](docs/en/serving/api_server_vl.md)
  - [Quantization](docs/en/quantization)
- Advanced Guide
  - [Inference Engine - TurboMind](docs/en/inference/turbomind.md)
  - [Inference Engine - PyTorch](docs/en/inference/pytorch.md)
  - [Customize chat templates](docs/en/advance/chat_template.md)
  - [Add a new model](docs/en/advance/pytorch_new_model.md)
  - gemm tuning
  - [Long context inference](docs/en/advance/long_context.md)
  - [Multi-model inference service](docs/en/serving/proxy_server.md)

# Third-party projects

- Deploying LLMs offline on the NVIDIA Jetson platform by LMDeploy: [LMDeploy-Jetson](https://github.com/BestAnHongjun/LMDeploy-Jetson)
## Contributing

...@@ -229,6 +176,8 @@ We appreciate all contributions to LMDeploy. Please refer to [CONTRIBUTING.md](.

- [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)
- [llm-awq](https://github.com/mit-han-lab/llm-awq)
- [vLLM](https://github.com/vllm-project/vllm)
- [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)

## License
......
<div align="center"> <div align="center">
<img src="resources/lmdeploy-logo.svg" width="450"/> <img src="docs/en/_static/image/lmdeploy-logo.svg" width="450"/>
[![docs](https://img.shields.io/badge/docs-latest-blue)](https://lmdeploy-zh-cn.readthedocs.io/zh_CN/latest/)
[![badge](https://github.com/InternLM/lmdeploy/workflows/lint/badge.svg)](https://github.com/InternLM/lmdeploy/actions)
[![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy) [![PyPI](https://img.shields.io/pypi/v/lmdeploy)](https://pypi.org/project/lmdeploy)
![PyPI - Downloads](https://img.shields.io/pypi/dm/lmdeploy)
[![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE) [![license](https://img.shields.io/github/license/InternLM/lmdeploy.svg)](https://github.com/InternLM/lmdeploy/tree/main/LICENSE)
[![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![issue resolution](https://img.shields.io/github/issues-closed-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues) [![open issues](https://img.shields.io/github/issues-raw/InternLM/lmdeploy)](https://github.com/InternLM/lmdeploy/issues)
[📘Documentation](https://lmdeploy.readthedocs.io/zh-cn/latest/) |
[🛠️Quick Start](https://lmdeploy.readthedocs.io/zh-cn/latest/get_started.html) |
[🤔Reporting Issues](https://github.com/InternLM/lmdeploy/issues/new/choose)
[English](README.md) | 简体中文 [English](README.md) | 简体中文
</div> 👋 join us on [![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=wechat&label=WeChat)](https://r.vansin.top/?r=internwx)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=twitter&label=Twitter)](https://twitter.com/intern_lm)
[![Static Badge](https://img.shields.io/badge/-grey?style=social&logo=discord&label=Discord)](https://discord.gg/xa29JuW87d)
<p align="center"> </div>
👋 join us on <a href="https://twitter.com/intern_lm" target="_blank">Twitter</a>, <a href="https://discord.gg/xa29JuW87d" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=internwx" target="_blank">WeChat</a>
</p>
______________________________________________________________________ ______________________________________________________________________
## 最新进展 🎉

<details open>
<summary><b>2024</b></summary>

- \[2024/03\] 支持视觉-语言模型(VLM)的离线推理 pipeline 和推理服务
- \[2024/02\] 支持 Qwen 1.5、Gemma、Mistral、Mixtral、Deepseek-MOE 等模型
- \[2024/01\] [OpenAOE](https://github.com/InternLM/OpenAOE) 发布,支持无缝接入[LMDeploy Serving Service](./docs/zh_cn/serving/api_server.md)
- \[2024/01\] 支持多模型、多机、多卡推理服务。使用方法请参考[此处](./docs/zh_cn/serving/proxy_server.md)
- \[2024/01\] 增加 [PyTorch 推理引擎](./docs/zh_cn/inference/pytorch.md),作为 TurboMind 引擎的补充。帮助降低开发门槛,和快速实验新特性、新技术

</details>

<details close>
<summary><b>2023</b></summary>

- \[2023/12\] Turbomind 支持多模态输入。[Gradio Demo](./examples/vl/README.md)
- \[2023/11\] Turbomind 支持直接读取 Huggingface 模型。点击[这里](docs/zh_cn/inference/load_hf.md)查看使用方法
- \[2023/11\] TurboMind 重磅升级。包括:Paged Attention、更快的且不受序列最大长度限制的 attention kernel、2+倍快的 KV8 kernels、Split-K decoding (Flash Decoding) 和支持 sm_75 架构的 W4A16
- \[2023/09\] TurboMind 支持 Qwen-14B
- \[2023/09\] TurboMind 支持 InternLM-20B 模型
...@@ -29,76 +47,72 @@ ______________________________________________________________________
- \[2023/08\] TurboMind 支持 flash-attention2
- \[2023/08\] TurboMind 支持 Qwen-7B,动态NTK-RoPE缩放,动态logN缩放
- \[2023/08\] TurboMind 支持 Windows (tp=1)
- \[2023/08\] TurboMind 支持 4-bit 推理,速度是 FP16 的 2.4 倍,是目前最快的开源实现。部署方式请看[这里](docs/zh_cn/quantization/w4a16.md)
- \[2023/08\] LMDeploy 开通了 [HuggingFace Hub](https://huggingface.co/lmdeploy) ,提供开箱即用的 4-bit 模型
- \[2023/08\] LMDeploy 支持使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法进行 4-bit 量化
- \[2023/07\] TurboMind 支持使用 GQA 的 Llama-2 70B 模型
- \[2023/07\] TurboMind 支持 Llama-2 7B/13B 模型
- \[2023/07\] TurboMind 支持 InternLM 的 Tensor Parallel 推理

</details>

______________________________________________________________________
# 简介

LMDeploy 由 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 和 [MMRazor](https://github.com/open-mmlab/mmrazor) 团队联合开发,是涵盖了 LLM 任务的全套轻量化、部署和服务解决方案。
这个强大的工具箱提供以下核心功能:

- **高效推理**:LMDeploy 开发了 Persistent Batch(即 Continuous Batch)、Blocked K/V Cache、动态拆分和融合、张量并行、高效的计算 kernel 等重要特性。推理性能是 vLLM 的 1.8 倍

- **可靠的量化**:LMDeploy 支持权重量化和 k/v 量化。4bit 模型推理效率是 FP16 下的 2.4 倍。量化模型的可靠性已通过 OpenCompass 评测得到充分验证

- **便捷的服务**:通过请求分发服务,LMDeploy 支持多模型在多机、多卡上的推理服务

- **有状态推理**:通过缓存多轮对话过程中 attention 的 k/v,记住对话历史,从而避免重复处理历史会话,显著提升长文本多轮对话场景中的效率

# 性能

LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型上,每秒处理的请求数是 vLLM 的 1.36 ~ 1.85 倍。在静态推理能力方面,TurboMind 4bit 模型推理速度(out token/s)远高于 FP16/BF16 推理,在小 batch 时可提高到 2.4 倍。
![v0 1 0-benchmark](https://github.com/InternLM/lmdeploy/assets/4560679/8e455cf1-a792-4fa8-91a2-75df96a2a5ba)

更多设备、更多计算精度、更多 setting 下的推理 benchmark,请参考以下链接:

- [A100](./docs/en/benchmark/a100_fp16.md)
- 4090
- 3090
- 2080

# 支持的模型

| Model | Size |
| :----------------: | :--------: |
| Llama | 7B - 65B |
| Llama2 | 7B - 70B |
| InternLM | 7B - 20B |
| InternLM2 | 7B - 20B |
| InternLM-XComposer | 7B |
| QWen | 7B - 72B |
| QWen-VL | 7B |
| QWen1.5 | 0.5B - 72B |
| Baichuan | 7B - 13B |
| Baichuan2 | 7B - 13B |
| Code Llama | 7B - 34B |
| ChatGLM2 | 6B |
| Falcon | 7B - 180B |
| YI | 6B - 34B |
| Mistral | 7B |
| DeepSeek-MoE | 16B |
| Mixtral | 8x7B |
| Gemma | 2B - 7B |
LMDeploy 支持 2 种推理引擎:[TurboMind](./docs/zh_cn/inference/turbomind.md) 和 [PyTorch](./docs/zh_cn/inference/pytorch.md),它们侧重不同。前者追求推理性能的极致优化,后者纯用 Python 开发,着重降低开发者的门槛。

它们在支持的模型类别、计算精度方面有所差别。用户可参考[这里](./docs/zh_cn/supported_models/supported_models.md),查阅每个推理引擎的能力,并根据实际需求选择合适的。
# 快速开始

## 安装

使用 pip ( python 3.8+) 安装 LMDeploy,或者[源码安装](./docs/zh_cn/build.md)

...@@ -106,117 +120,54 @@ TurboMind 的 output token throughput 超过 2000 token/s, 整体比 DeepSpeed

pip install lmdeploy
```
LMDeploy的预编译包默认是基于 CUDA 11.8 编译的。如果需要在 CUDA 12+ 下安装 LMDeploy,请执行以下命令:

```shell
export LMDEPLOY_VERSION=0.2.0
export PYTHON_VERSION=38
pip install https://github.com/InternLM/lmdeploy/releases/download/v${LMDEPLOY_VERSION}/lmdeploy-${LMDEPLOY_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux2014_x86_64.whl
```
## 离线批处理

```python
import lmdeploy
pipe = lmdeploy.pipeline("internlm/internlm-chat-7b")
response = pipe(["Hi, pls intro yourself", "Shanghai is"])
print(response)
```

> \[!NOTE\]
> LMDeploy 默认从 HuggingFace 上面下载模型,如果要从 ModelScope 上面下载模型,请通过命令 `pip install modelscope` 安装 ModelScope,并设置环境变量:
>
> `export LMDEPLOY_USE_MODELSCOPE=True`
关于 pipeline 的更多推理参数说明,请参考[这里](./docs/zh_cn/inference/pipeline.md)

# 用户教程

请阅读[快速上手](./docs/zh_cn/get_started.md)章节,了解 LMDeploy 的基本用法。

为了帮助用户更进一步了解 LMDeploy,我们准备了用户指南和进阶指南,请阅读我们的[文档](https://lmdeploy.readthedocs.io/zh-cn/latest/):

- 用户指南
  - [LLM 推理 pipeline](./docs/zh_cn/inference/pipeline.md)
  - [VLM 推理 pipeline](./docs/zh_cn/inference/vl_pipeline.md)
  - [LLM 推理服务](./docs/zh_cn/serving/api_server.md)
  - [VLM 推理服务](./docs/zh_cn/serving/api_server_vl.md)
  - [模型量化](./docs/zh_cn/quantization)
- 进阶指南
  - [推理引擎 - TurboMind](./docs/zh_cn/inference/turbomind.md)
  - [推理引擎 - PyTorch](./docs/zh_cn/inference/pytorch.md)
  - [自定义对话模板](./docs/zh_cn/advance/chat_template.md)
  - [支持新模型](./docs/zh_cn/advance/pytorch_new_model.md)
  - gemm tuning
  - [长文本推理](./docs/zh_cn/advance/long_context.md)
  - [多模型推理服务](./docs/zh_cn/serving/proxy_server.md)

# 社区项目

- 使用 LMDeploy 在英伟达 Jetson 系列板卡部署大模型:[LMDeploy-Jetson](https://github.com/BestAnHongjun/LMDeploy-Jetson)
## 贡献指南

...@@ -226,6 +177,8 @@ LMDeploy 使用 [AWQ](https://arxiv.org/abs/2306.00978) 算法对模型权重进

- [FasterTransformer](https://github.com/NVIDIA/FasterTransformer)
- [llm-awq](https://github.com/mit-han-lab/llm-awq)
- [vLLM](https://github.com/vllm-project/vllm)
- [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)

## License
......
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-13b model"
exit 1
fi
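# Usage (illustrative): bash <this-script> /path/to/llama2-13b-hf-model
# The script converts the model for TurboMind (tp=1), tunes gemm, then runs
# the request-throughput and static-generation benchmarks defined below.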
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 500
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 16 32 64 \
--csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of internlm-20b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert internlm-20b ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 700
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 16 32 64 \
--csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
cp gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-70b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128 256)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 64 128 256 \
--csv ${output_path}/generation.csv
}
output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-7b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? != 0 ]
then
exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 1000
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
output_path=$1
mkdir -p "${output_path}"
batches=(64 128)
for batch in "${batches[@]}"
do
for i in {1..3}
do
python3 profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
${turbomind_model_path} \
--concurrency "$batch" \
--num_prompts 3000 \
--csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
done
done
}
benchmark_generation () {
output_path=$1
mkdir -p "${output_path}"
python3 profile_generation.py \
${turbomind_model_path} \
--concurrency 1 16 32 64 \
--csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import csv
import os
import time
from dataclasses import dataclass
from queue import Queue
from threading import Thread
from typing import List, Union

import numpy as np
from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
from tqdm import tqdm

from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
                               TurbomindEngineConfig)
def infer(model, session_id: int, input_ids: List,
          gen_config: EngineGenerationConfig, test_round: int, que: Queue):
    if session_id == 1:
        pbar = tqdm(total=test_round)
    chatbot = model.create_instance()
    output_seqlen = gen_config.max_new_tokens
    stats = []
    for _ in range(test_round):
        token_latency_stats = [0] * (output_seqlen + 1)
...@@ -44,20 +45,19 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,

        """  # noqa: E501
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            gen_config=gen_config,
                                            sequence_start=True,
                                            sequence_end=True,
                                            stream_output=True):
            _, res, n_token = outputs
            now = time.perf_counter()
            if n_prev_token != n_token:
                token_latency_stats[n_prev_token] = np.round(now - prev, 3)
                n_prev_token = n_token
            prev = now
        # for pytorch engine to restart a session
        if hasattr(chatbot, 'end'):
            chatbot.end(session_id)
        if session_id == 1:
            pbar.update(1)

...@@ -68,12 +68,13 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,

    que.put((session_id, stats))
def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int,
           gen_config: EngineGenerationConfig):
    if not warmup_round:
        return

    print('start to warmup ...')
    output_seqlen = gen_config.max_new_tokens

    def _infer(model, session_id):
        chatbot = model.create_instance()

...@@ -84,15 +85,16 @@ def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,

                                       sequence_start=True,
                                       sequence_end=True,
                                       ignore_eos=True,
                                       gen_config=gen_config):
            continue
        # for pytorch engine to restart a session
        if hasattr(chatbot, 'end'):
            chatbot.end(session_id)

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1), daemon=True)
        procs.append(proc)
        proc.start()

...@@ -104,25 +106,27 @@ def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,
def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
                       engine_config: Union[PytorchEngineConfig,
                                            TurbomindEngineConfig],
                       gen_config: EngineGenerationConfig, test_round: int,
                       warmup_round: int):
    output_seqlen = gen_config.max_new_tokens
    print(f'profiling ... concurrency: {concurrency}, '
          f'n_prompt_token: {input_seqlen}, '
          f'n_completion_token: {output_seqlen}, '
          f'test_round: {test_round}, warmup_round: {warmup_round}')

    if isinstance(engine_config, TurbomindEngineConfig):
        from lmdeploy.turbomind import TurboMind
        tm_model = TurboMind.from_pretrained(model_path,
                                             engine_config=engine_config)
    elif isinstance(engine_config, PytorchEngineConfig):
        from lmdeploy.pytorch.engine import Engine
        tm_model = Engine(model_path, engine_config)

    # make up a dummy `input_ids` with the length of `input_seqlen` exactly
    assert input_seqlen > 0, 'input_seqlen should > 0'
    input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
    warmup(tm_model, concurrency, input_ids, warmup_round, gen_config)

    que = Queue()
    procs = []

...@@ -130,8 +134,8 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,

    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, gen_config, test_round,
                            que))
        procs.append(proc)
        proc.start()

...@@ -186,76 +190,76 @@ def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,

        percentiles, throughput, tm_model.gpu_count
class MemoryMonitor:
    from multiprocessing import Manager
    max_mem = Manager().Value('f', 0)  # GB
    device_count = Manager().Value('f', 0)

    @staticmethod
    def nvidia_info():
        # pip install nvidia-ml-py
        nvidia_dict = {
            'state': True,
            'nvidia_version': '',
            'nvidia_count': 0,
            'gpus': []
        }
        try:
            nvmlInit()
            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
            for i in range(nvidia_dict['nvidia_count']):
                handle = nvmlDeviceGetHandleByIndex(i)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                gpu = {
                    'gpu_name': nvmlDeviceGetName(handle),
                    'total': memory_info.total,
                    'free': memory_info.free,
                    'used': memory_info.used,
                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
                    'powerStatus': nvmlDeviceGetPowerState(handle)
                }
                nvidia_dict['gpus'].append(gpu)
        except NVMLError as _:  # noqa
            nvidia_dict['state'] = False
        except Exception as _:  # noqa
            nvidia_dict['state'] = False
        finally:
            try:
                nvmlShutdown()
            except:  # noqa
                pass
        return nvidia_dict

    @classmethod
    def mem_monitor(cls):
        info = cls.nvidia_info()
        max_mem = 0
        mem_start = 0
        cls.device_count.value = len(info['gpus'])
        for used_total in info['gpus']:
            mem_start += used_total['used']
        while True:
            info = cls.nvidia_info()
            used = 0
            for used_total in info['gpus']:
                used += used_total['used']
            if used > max_mem:
                max_mem = used
                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)

    @classmethod
    def start(cls):
        cls._running = True
        from multiprocessing import Process
        cls.proc = Process(target=cls.mem_monitor, daemon=True)
        cls.proc.start()

    @classmethod
    def terminate(cls) -> float:
        """Terminate the subprocess and return maximum memory."""
        cls.proc.kill()
        return cls.max_mem.value
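# An illustrative usage sketch (the call sites live outside this hunk):
# MemoryMonitor is meant to bracket a profiling run so that the peak extra
# GPU memory can be reported afterwards, e.g.
#
#     MemoryMonitor.start()
#     try:
#         ...  # run profile_throughput(...) in a worker here
#     finally:
#         peak_gb = MemoryMonitor.terminate()
#         print(f'peak extra GPU memory: {peak_gb:.2f} GB')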
@dataclass
...@@ -274,66 +278,97 @@ class ProfileResult:
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description='Regression Test') parser = argparse.ArgumentParser(
description='Profile the token generation performance with'
' pytorch or turbomind engine',
formatter_class=DefaultsAndTypesHelpFormatter)
parser.add_argument('model_path', parser.add_argument('model_path',
type=str, type=str,
help='the path of the model in localhost or ' help='the path of the model in localhost or '
'the repo_id of the model in huggingface.co') 'the repo_id of the model in huggingface.co')
parser.add_argument('--concurrency', parser.add_argument('-c',
'--concurrency',
nargs='+', nargs='+',
type=int, type=int,
help='how many requests launched concurrently', help='how many requests launched concurrently',
default=[1, 16, 32, 64]) default=[1, 16, 32, 64])
parser.add_argument( parser.add_argument(
'-pt',
'--prompt-tokens', '--prompt-tokens',
nargs='+', nargs='+',
type=int, type=int,
help='how many requests launched concurrently. One-to-one' help='the number of prompt tokens per request. One-to-one '
'correspondence with completion-tokens', 'correspondence with completion-tokens',
default=[1, 128, 128, 2048, 2048]) default=[1, 128, 128, 2048, 2048])
parser.add_argument('--completion-tokens', parser.add_argument('-ct',
'--completion-tokens',
nargs='+', nargs='+',
type=int, type=int,
help='how many tokens to be generated. One-to-one' help='how many tokens to be generated. One-to-one '
'correspondence with prompt-tokens', 'correspondence with prompt-tokens',
default=[128, 128, 2048, 128, 2048]) default=[128, 128, 2048, 128, 2048])
parser.add_argument('--tp', type=int, help='Tensor parallel', default=1)
parser.add_argument('--top_k',
type=int,
help='The number of highest probability vocabulary '
'tokens to keep for top-k-filtering',
default=1)
parser.add_argument('--top_p',
type=float,
help='the set of most probable tokens with '
'probabilities that add up to top_p or higher '
'are kept for generation',
default=1.0)
parser.add_argument('--temperature',
type=float,
help='The value used to modulate the next token '
'probabilities',
default=1.0)
parser.add_argument('--csv', parser.add_argument('--csv',
type=str, type=str,
help='Where to save the result.', help='Where to save the result.',
default='profile_generation.csv') default='profile_generation.csv')
parser.add_argument('--log-level', parser.add_argument('-tr',
help='set log level', '--test-round',
default='ERROR',
choices=list(logging._nameToLevel.keys()))
parser.add_argument('--test-round',
type=int, type=int,
help='number of test rounds', help='number of test rounds',
default=6) default=3)
parser.add_argument('--warmup-round', parser.add_argument('-w',
'--warmup-round',
type=int, type=int,
help='number of warmuop rounds', help='number of warmup rounds',
default=1) default=1)
# other args
ArgumentHelper.top_p(parser)
ArgumentHelper.temperature(parser)
ArgumentHelper.top_k(parser)
ArgumentHelper.log_level(parser)
ArgumentHelper.backend(parser)
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group, default=2048)
# turbomind engine args
tb_group = parser.add_argument_group('TurboMind engine argument')
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(cache_count_act)
ArgumentHelper.model_format(tb_group, default='hf')
args = parser.parse_args() args = parser.parse_args()
return args return args
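parse_args() above registers --tp, --session-len and --cache-max-entry-count once (through ArgumentHelper) and then appends the same argparse actions to the TurboMind group so they are listed under both engines in --help. A minimal standard-library sketch of that trick, with an illustrative --tp flag:

# Sketch of sharing one argparse action between two help groups, as done
# above with the tp / session_len / cache_max_entry_count actions.
import argparse

parser = argparse.ArgumentParser(description='shared-action demo')

pt_group = parser.add_argument_group('PyTorch engine arguments')
tp_act = pt_group.add_argument('--tp', type=int, default=1,
                               help='tensor parallel degree (illustrative)')

tb_group = parser.add_argument_group('TurboMind engine argument')
# _group_actions is a private argparse attribute; re-listing the existing
# action here only changes --help output, parsing still happens once.
tb_group._group_actions.append(tp_act)

args = parser.parse_args(['--tp', '2'])
print(args.tp)  # -> 2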
def __proc_cb(*args, ret_pipe, target):
try:
ret = target(*args)
ret_pipe[1].send(ret)
except Exception as e:
ret_pipe[1].send(e)
def _process_map(target, iterable):
from multiprocessing import Pipe, get_context
pipe = Pipe(False)
spawn_context = get_context('spawn')
proc = spawn_context.Process(target=__proc_cb,
args=iterable,
kwargs=dict(ret_pipe=pipe, target=target))
proc.start()
proc.join()
ret = pipe[0].recv()
if isinstance(ret, Exception):
raise ret
return ret
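_process_map above replaces the old Pool(1).map call: it runs the profiling target in a freshly spawned process and ships the return value, or any raised exception, back to the parent over a one-way Pipe. A self-contained sketch of the same pattern with an illustrative worker function:

# Sketch of the run-in-a-spawned-process pattern used by _process_map above.
# `square` is an illustrative worker, not part of lmdeploy.
from multiprocessing import Pipe, get_context


def _worker(*args, ret_pipe, target):
    try:
        ret_pipe[1].send(target(*args))
    except Exception as e:          # ship the exception to the parent instead
        ret_pipe[1].send(e)


def run_in_subprocess(target, args):
    pipe = Pipe(False)              # one-way: child writes, parent reads
    ctx = get_context('spawn')      # fresh interpreter, no inherited CUDA state
    proc = ctx.Process(target=_worker, args=args,
                       kwargs=dict(ret_pipe=pipe, target=target))
    proc.start()
    proc.join()
    ret = pipe[0].recv()
    if isinstance(ret, Exception):
        raise ret
    return ret


def square(x):
    return x * x


if __name__ == '__main__':
    print(run_in_subprocess(square, (8,)))   # -> 64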
def main(): def main():
args = parse_args() args = parse_args()
assert len(args.prompt_tokens) == len(args.completion_tokens), \ assert len(args.prompt_tokens) == len(args.completion_tokens), \
...@@ -342,30 +377,49 @@ def main(): ...@@ -342,30 +377,49 @@ def main():
os.environ['TM_LOG_LEVEL'] = args.log_level os.environ['TM_LOG_LEVEL'] = args.log_level
results: List[ProfileResult] = [] results: List[ProfileResult] = []
for batch in args.concurrency: for batch in args.concurrency:
for prompt_tokens, completion_tokens in zip(args.prompt_tokens, for prompt_tokens, completion_tokens in zip(args.prompt_tokens,
args.completion_tokens): args.completion_tokens):
# MemoryMonitor.start() MemoryMonitor.start()
from functools import partial from functools import partial
from multiprocessing import Pool
profile_target = partial(profile_throughput, # make sure session_len >= prompt_tokens + completion_tokens
concurrency=batch, session_len = max(args.session_len,
input_seqlen=prompt_tokens, prompt_tokens + completion_tokens)
output_seqlen=completion_tokens, if args.backend == 'turbomind':
engine_config = TurbomindEngineConfig(
cache_max_entry_count=args.cache_max_entry_count,
model_format=args.model_format,
session_len=session_len,
tp=args.tp)
elif args.backend == 'pytorch':
engine_config = PytorchEngineConfig(
cache_max_entry_count=args.cache_max_entry_count,
session_len=session_len,
tp=args.tp, tp=args.tp,
thread_safe=True)
gen_config = EngineGenerationConfig(
top_k=args.top_k, top_k=args.top_k,
top_p=args.top_p, top_p=args.top_p,
temperature=args.temperature, temperature=args.temperature,
max_new_tokens=completion_tokens,
ignore_eos=True)
profile_target = partial(
profile_throughput,
concurrency=batch,
input_seqlen=prompt_tokens,
engine_config=engine_config,
gen_config=gen_config,
test_round=args.test_round, test_round=args.test_round,
warmup_round=args.warmup_round) warmup_round=args.warmup_round,
output = Pool(1).map(profile_target, (args.model_path, )) )
output = _process_map(profile_target, (args.model_path, ))
model_name, first_token_latency, percentiles, \ model_name, first_token_latency, percentiles, \
throughput_per_proc, tp = output[0] throughput_per_proc, tp = output
time.sleep(5) # wait a while for releasing GPU mem time.sleep(5) # wait a while for releasing GPU mem
# memory = MemoryMonitor.terminate() memory = MemoryMonitor.terminate()
# device_count = MemoryMonitor.device_count.value device_count = MemoryMonitor.device_count.value
memory=0
device_count=0
results.append( results.append(
ProfileResult(model_name=model_name, ProfileResult(model_name=model_name,
batch=batch, batch=batch,
......
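main() now picks an engine configuration per backend and bundles the sampling parameters into an EngineGenerationConfig before each measurement. The sketch below restates that selection using only the fields visible in this diff; the exact lmdeploy signatures are assumed from this commit rather than from a stable API.

# Sketch of the backend/config selection done in main() above. Field names
# are taken from the diff; the lmdeploy signatures are assumed, not verified.
from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
                               TurbomindEngineConfig)


def build_configs(backend, prompt_tokens, completion_tokens, args):
    # the session must hold the prompt plus everything we intend to generate
    session_len = max(args.session_len, prompt_tokens + completion_tokens)
    if backend == 'turbomind':
        engine_config = TurbomindEngineConfig(
            session_len=session_len,
            cache_max_entry_count=args.cache_max_entry_count,
            model_format=args.model_format,
            tp=args.tp)
    else:  # 'pytorch'
        engine_config = PytorchEngineConfig(
            session_len=session_len,
            cache_max_entry_count=args.cache_max_entry_count,
            tp=args.tp,
            thread_safe=True)
    gen_config = EngineGenerationConfig(top_k=args.top_k,
                                        top_p=args.top_p,
                                        temperature=args.temperature,
                                        max_new_tokens=completion_tokens,
                                        ignore_eos=True)
    return engine_config, gen_config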
...@@ -75,8 +75,9 @@ import torch ...@@ -75,8 +75,9 @@ import torch
from transformers import AutoModelForCausalLM, GenerationConfig from transformers import AutoModelForCausalLM, GenerationConfig
from lmdeploy.pytorch.accel import LoadNoInit from lmdeploy.pytorch.accel import LoadNoInit
from lmdeploy.utils import get_logger
logger = logging.getLogger(__file__) logger = get_logger(__file__)
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
info = logger.info info = logger.info
warning = logger.warning warning = logger.warning
......
...@@ -28,6 +28,9 @@ def sample_requests( ...@@ -28,6 +28,9 @@ def sample_requests(
dataset = [(data['conversations'][0]['value'], dataset = [(data['conversations'][0]['value'],
data['conversations'][1]['value']) for data in dataset] data['conversations'][1]['value']) for data in dataset]
# pre-sample to avoid going through the whole dataset
dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000))
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset] prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids prompt_token_ids = tokenizer(prompts).input_ids
...@@ -204,8 +207,8 @@ class Engine: ...@@ -204,8 +207,8 @@ class Engine:
def main(server_addr: str, def main(server_addr: str,
tokenizer_path: str, tokenizer_path: str,
dataset: str, dataset: str,
concurrency: int = 64, concurrency: int = 128,
num_prompts: int = 2000, num_prompts: int = 5000,
top_p: float = 1.0, top_p: float = 1.0,
temperature: float = 1.0, temperature: float = 1.0,
stream_output: bool = False, stream_output: bool = False,
...@@ -218,8 +221,8 @@ def main(server_addr: str, ...@@ -218,8 +221,8 @@ def main(server_addr: str,
tokenizer_path (str): Path to the tokenizer model in localhost tokenizer_path (str): Path to the tokenizer model in localhost
dataset (str): Path to the dataset dataset (str): Path to the dataset
concurrency (int, optional): Number of working threads to process the sampled prompts. concurrency (int, optional): Number of working threads to process the sampled prompts.
Defaults to 64. Defaults to 128.
num_prompts (int, optional): Number of prompts to process. Defaults to 2000. num_prompts (int, optional): Number of prompts to process. Defaults to 5000.
top_p (float, optional): the set of most probable tokens with top_p (float, optional): the set of most probable tokens with
probabilities that add up to top_p or higher probabilities that add up to top_p or higher
are kept for generation. Defaults to 1.0. are kept for generation. Defaults to 1.0.
......
...@@ -28,6 +28,9 @@ def sample_requests( ...@@ -28,6 +28,9 @@ def sample_requests(
dataset = [(data['conversations'][0]['value'], dataset = [(data['conversations'][0]['value'],
data['conversations'][1]['value']) for data in dataset] data['conversations'][1]['value']) for data in dataset]
# pre-sample to avoid going through the whole dataset
dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000))
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset] prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids prompt_token_ids = tokenizer(prompts).input_ids
...@@ -80,7 +83,6 @@ class Engine: ...@@ -80,7 +83,6 @@ class Engine:
chatbot = Chatbot(self.server_addr, chatbot = Chatbot(self.server_addr,
ignore_eos=True, ignore_eos=True,
profile_serving=True,
top_k=self.top_k, top_k=self.top_k,
top_p=self.top_p, top_p=self.top_p,
temperature=self.temperature, temperature=self.temperature,
...@@ -150,6 +152,7 @@ class Engine: ...@@ -150,6 +152,7 @@ class Engine:
session_id, _stats = res_queue.get() session_id, _stats = res_queue.get()
# print(f'\n{"-" * 50}\n' # print(f'\n{"-" * 50}\n'
# f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n') # f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
if len(_stats) != 0:
stats.append(np.array(_stats)) stats.append(np.array(_stats))
stats = np.concatenate(stats).reshape(-1, 5) stats = np.concatenate(stats).reshape(-1, 5)
......
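The `if len(_stats) != 0` guard added above matters because a session that produced no samples becomes a 1-D empty array and breaks the later concatenate/reshape. A small numpy illustration:

# Why the `if len(_stats) != 0` guard above is needed: an empty per-session
# list would turn into a shape-(0,) array and break np.concatenate(...).reshape(-1, 5).
import numpy as np

per_session = [[(1, 2, 3, 4, 5), (6, 7, 8, 9, 10)], [], [(11, 12, 13, 14, 15)]]

stats = []
for _stats in per_session:
    if len(_stats) != 0:            # skip sessions that produced nothing
        stats.append(np.array(_stats))

stats = np.concatenate(stats).reshape(-1, 5)
print(stats.shape)                  # -> (3, 5)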
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import argparse
import csv import csv
import json import json
import os import os
...@@ -6,14 +7,16 @@ import random ...@@ -6,14 +7,16 @@ import random
import time import time
from queue import Queue from queue import Queue
from threading import Thread from threading import Thread
from typing import List, Tuple from typing import List, Tuple, Union
import fire
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from lmdeploy.tokenizer import Tokenizer from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
from lmdeploy.turbomind import TurboMind from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
TurbomindEngineConfig)
from lmdeploy.pytorch.engine.engine import EngineInstance
from lmdeploy.tokenizer import DetokenizeState, Tokenizer
def sample_requests( def sample_requests(
...@@ -30,6 +33,9 @@ def sample_requests( ...@@ -30,6 +33,9 @@ def sample_requests(
dataset = [(data['conversations'][0]['value'], dataset = [(data['conversations'][0]['value'],
data['conversations'][1]['value']) for data in dataset] data['conversations'][1]['value']) for data in dataset]
# pre-sample to avoid going through the whole dataset
dataset = random.sample(dataset, max(int(num_requests * 1.2), 1000))
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompts = [prompt for prompt, _ in dataset] prompts = [prompt for prompt, _ in dataset]
prompt_token_ids = tokenizer(prompts).input_ids prompt_token_ids = tokenizer(prompts).input_ids
...@@ -59,19 +65,25 @@ def sample_requests( ...@@ -59,19 +65,25 @@ def sample_requests(
class Engine: class Engine:
def __init__(self, model_path: str, tp: int, csv: str, **kwargs): def __init__(self, model_path: str,
# avoid turbomind checking chat template name by setting engine_config: Union[PytorchEngineConfig,
# `model_name='llama'` TurbomindEngineConfig], csv: str):
tm_model = TurboMind(model_path=model_path, if isinstance(engine_config, TurbomindEngineConfig):
model_name='llama', from lmdeploy.turbomind import TurboMind
tp=tp, tm_model = TurboMind.from_pretrained(model_path,
**kwargs) engine_config=engine_config)
elif isinstance(engine_config, PytorchEngineConfig):
from lmdeploy.pytorch.engine import Engine as PytorchEngine
tm_model = PytorchEngine(model_path, engine_config=engine_config)
self.tm_model = tm_model self.tm_model = tm_model
self.tokenizer = tm_model.tokenizer self.tokenizer = tm_model.tokenizer
self.csv = csv self.csv = csv
self.pbar = None self.pbar = None
def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
temperature: float, top_p: float, top_k: int,
stream_output: bool): stream_output: bool):
model_inst = self.tm_model.create_instance() model_inst = self.tm_model.create_instance()
stats = [] stats = []
...@@ -80,31 +92,35 @@ class Engine: ...@@ -80,31 +92,35 @@ class Engine:
for prompt, input_seqlen, output_seqlen in iter( for prompt, input_seqlen, output_seqlen in iter(
req_queue.get, [None, None, None]): req_queue.get, [None, None, None]):
_per_token_latency_stats = [0] * (output_seqlen + 1) _per_token_latency_stats = [0] * (output_seqlen + 1)
offset = 0 state = DetokenizeState()
prev = time.perf_counter() prev = time.perf_counter()
n_prev_token = 0 n_prev_token = 0
input_ids = self.tokenizer(prompt).input_ids input_ids = self.tokenizer(prompt).input_ids
for outputs in model_inst.stream_infer( for outputs in model_inst.stream_infer(
session_id, session_id,
input_ids=input_ids, input_ids=input_ids,
request_output_len=output_seqlen, gen_config=EngineGenerationConfig(
temperature=1.0, max_new_tokens=output_seqlen,
top_p=1.0, temperature=temperature,
top_p=top_p,
top_k=top_k,
ignore_eos=True),
sequence_start=True, sequence_start=True,
sequence_end=True, sequence_end=True,
ignore_eos=True,
stream_output=stream_output): stream_output=stream_output):
res, n_token = outputs[0] _, res, n_token = outputs
self.tokenizer.decode(res, offset) _, state = self.tokenizer.detokenize_incrementally(res, state)
offset = n_token
now = time.perf_counter() now = time.perf_counter()
if n_prev_token != n_token: if n_prev_token != n_token:
_per_token_latency_stats[n_prev_token] = np.round( _per_token_latency_stats[n_prev_token] = np.round(
now - prev, 3) now - prev, 3)
n_prev_token = n_token n_prev_token = n_token
prev = now prev = now
# for pytorch engine to restart a session
if isinstance(model_inst, EngineInstance):
model_inst.end(session_id)
assert output_seqlen <= n_token <= output_seqlen + 1, \ assert output_seqlen <= n_token <= output_seqlen + 1, \
f'Error. session_id({session_id}) request {output_seqlen} ' \ f'Error. session_id({session_id}) request {output_seqlen} ' \
f'tokens, but generate {n_token} tokens.\n' \ f'tokens, but generate {n_token} tokens.\n' \
...@@ -122,10 +138,8 @@ class Engine: ...@@ -122,10 +138,8 @@ class Engine:
self.pbar.update(1) self.pbar.update(1)
res_queue.put((session_id, stats, per_token_latency_stats)) res_queue.put((session_id, stats, per_token_latency_stats))
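The streaming loop above timestamps every stream_infer step but records a latency sample only when the token count actually advances, so _per_token_latency_stats stays aligned with generated tokens rather than with stream callbacks. A minimal sketch of that bookkeeping against a fake token stream (the generator below is illustrative, standing in for model_inst.stream_infer):

# Sketch of the per-token latency bookkeeping in _inference above, driven by
# a fake token stream instead of a real engine.
import time

import numpy as np


def fake_stream(output_seqlen):
    """Illustrative stand-in for stream_infer: yields a growing token count."""
    for n_token in range(1, output_seqlen + 1):
        time.sleep(0.01)            # pretend the model spent time decoding
        yield n_token


output_seqlen = 8
per_token_latency = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_prev_token = 0

for n_token in fake_stream(output_seqlen):
    now = time.perf_counter()
    if n_prev_token != n_token:     # only record when new tokens arrived
        per_token_latency[n_prev_token] = np.round(now - prev, 3)
        n_prev_token = n_token
        prev = now

print(per_token_latency)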
def process_request(self, def process_request(self, requests, concurrency, temperature, top_p, top_k,
requests, stream_output):
concurrency: int = 1,
stream_output: bool = True):
res_queue = Queue() res_queue = Queue()
req_queue = Queue() req_queue = Queue()
threads = [] threads = []
...@@ -143,7 +157,9 @@ class Engine: ...@@ -143,7 +157,9 @@ class Engine:
# start threads # start threads
for i in range(concurrency): for i in range(concurrency):
t = Thread(target=self._inference, t = Thread(target=self._inference,
args=(req_queue, res_queue, i, stream_output)) args=(req_queue, res_queue, i, temperature, top_p,
top_k, stream_output),
daemon=True)
t.start() t.start()
threads.append(t) threads.append(t)
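process_request() above fans the sampled requests out to daemon worker threads through a Queue and stops each worker with a [None, None, None] sentinel consumed via iter(req_queue.get, ...). A self-contained sketch of that fan-out; the worker body is illustrative, where lmdeploy's worker calls stream_infer:

# Sketch of the queue-plus-sentinel fan-out used by process_request above.
from queue import Queue
from threading import Thread

SENTINEL = [None, None, None]


def worker(req_queue: Queue, res_queue: Queue, worker_id: int):
    results = []
    # iter(get, SENTINEL) keeps pulling requests until the sentinel arrives
    for prompt, input_len, output_len in iter(req_queue.get, SENTINEL):
        results.append((prompt, input_len + output_len))
    res_queue.put((worker_id, results))


req_queue, res_queue = Queue(), Queue()
requests = [('hi', 1, 8), ('hello', 2, 8), ('hey', 1, 16)]
concurrency = 2

for req in requests:
    req_queue.put(req)
for _ in range(concurrency):        # one sentinel per worker thread
    req_queue.put(SENTINEL)

threads = [Thread(target=worker, args=(req_queue, res_queue, i), daemon=True)
           for i in range(concurrency)]
for t in threads:
    t.start()
for t in threads:
    t.join()

while not res_queue.empty():
    print(res_queue.get())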
...@@ -225,53 +241,90 @@ class Engine: ...@@ -225,53 +241,90 @@ class Engine:
]) ])
def main(dataset: str, def parse_args():
model_path: str, parser = argparse.ArgumentParser(
concurrency: int = 64, description='Benchmark the request throughput of lmdeploy '
num_prompts: int = 2000, 'in localhost',
tp: int = 1, formatter_class=DefaultsAndTypesHelpFormatter)
top_k: int = 1, parser.add_argument('dataset', type=str, help='the path of the dataset')
top_p: float = 1.0, parser.add_argument('model_path',
temperature: float = 1.0, type=str,
stream_output: bool = True, help='the path of the model in localhost or '
csv: str = './profile_throughput.csv', 'the repo_id of the model in huggingface.co')
log_level: str = 'ERROR', parser.add_argument(
seed: int = 0): '-c',
"""Benchmark the request throughput of lmdeploy in localhost. '--concurrency',
type=int,
Args: help='Number of working threads to process the sampled prompts',
dataset (str): Path to the dataset default=256)
model_path (str): Path to a model in localhost or a model_repo_id in huggingface.co parser.add_argument('-n',
concurrency (int, optional): Number of working threads to process the sampled prompts. '--num-prompts',
Defaults to 64. type=int,
num_prompts (int, optional): Number of prompts to process. Defaults to 2000. help='Number of prompts to process',
tp (int, optional): Number of GPUs for tensor parallel. Defaults to 1. default=5000)
top_k (int, optional): The number of highest probability vocabulary tokens parser.add_argument('--csv',
to keep for top-k-filtering. Defaults to 1. type=str,
top_p (float, optional): the set of most probable tokens with help='Where to save the result.',
probabilities that add up to top_p or higher default='./profile_throughput.csv')
are kept for generation. Defaults to 1.0. parser.add_argument('--seed',
temperature (float, optional): The value used to modulate the next token probabilities. type=int,
Defaults to 1.0. default=0,
stream_output (bool, optional): Indicator for streaming output. Defaults to True. help='Seed used in sampling prompts from dataset')
csv (str, optional): The path to save the result. # other args
log_level(str, optional): The log level. Defaults to INFO ArgumentHelper.top_p(parser)
seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0. ArgumentHelper.temperature(parser)
""" # noqa ArgumentHelper.top_k(parser)
random.seed(seed) ArgumentHelper.log_level(parser)
os.environ['TM_LOG_LEVEL'] = log_level ArgumentHelper.backend(parser)
engine = Engine(model_path, # pytorch engine args
tp=tp, pt_group = parser.add_argument_group('PyTorch engine arguments')
top_k=top_k, tp_act = ArgumentHelper.tp(pt_group)
top_p=top_p, session_len_act = ArgumentHelper.session_len(pt_group, default=4096)
temperature=temperature, cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
csv=csv)
# turbomind engine args
requests = sample_requests(dataset, num_prompts, engine.tokenizer) tb_group = parser.add_argument_group('TurboMind engine argument')
tb_group._group_actions.append(tp_act)
engine.process_request(requests, concurrency, stream_output) tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(cache_count_act)
ArgumentHelper.model_format(tb_group, default='hf')
args = parser.parse_args()
return args
def main():
args = parse_args()
random.seed(args.seed)
os.environ['TM_LOG_LEVEL'] = args.log_level
if args.backend == 'turbomind':
engine_config = TurbomindEngineConfig(
session_len=args.session_len,
max_batch_size=args.concurrency,
tp=args.tp,
cache_max_entry_count=args.cache_max_entry_count,
model_format=args.model_format)
elif args.backend == 'pytorch':
engine_config = PytorchEngineConfig(
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
max_batch_size=args.concurrency,
tp=args.tp,
thread_safe=True)
engine = Engine(args.model_path, engine_config, csv=args.csv)
requests = sample_requests(args.dataset, args.num_prompts,
engine.tokenizer)
engine.process_request(requests,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
concurrency=args.concurrency,
stream_output=True)
if __name__ == '__main__': if __name__ == '__main__':
fire.Fire(main) main()
cmake .. -A x64 -T v142,cuda="$env:CUDA_PATH" ` cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" `
-DCMAKE_BUILD_TYPE=Release ` -DCMAKE_BUILD_TYPE=Release `
-DCMAKE_INSTALL_PREFIX=install ` -DCMAKE_INSTALL_PREFIX=install `
-DBUILD_PY_FFI=ON ` -DBUILD_PY_FFI=ON `
......
FROM nvcr.io/nvidia/tritonserver:22.12-py3 FROM nvcr.io/nvidia/tritonserver:22.12-py3
RUN rm /etc/apt/sources.list.d/cuda*.list && apt-get update && apt-get install -y --no-install-recommends \ RUN rm /etc/apt/sources.list.d/cuda*.list && apt-get update && apt-get install -y --no-install-recommends \
rapidjson-dev libgoogle-glog-dev gdb \ rapidjson-dev libgoogle-glog-dev gdb python3.8-venv \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py38
RUN python3 -m pip install --no-cache-dir torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 ENV PATH=/opt/py38/bin:$PATH
RUN python3 -m pip install --no-cache-dir cmake packaging
RUN python3 -m pip install --no-cache-dir --upgrade pip &&\
python3 -m pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118 &&\
python3 -m pip install --no-cache-dir cmake packaging wheel
ENV NCCL_LAUNCH_MODE=GROUP ENV NCCL_LAUNCH_MODE=GROUP
...@@ -29,7 +32,7 @@ RUN cd /opt/lmdeploy &&\ ...@@ -29,7 +32,7 @@ RUN cd /opt/lmdeploy &&\
-DUSE_NVTX=ON &&\ -DUSE_NVTX=ON &&\
make -j$(nproc) && make install &&\ make -j$(nproc) && make install &&\
cd .. &&\ cd .. &&\
python3 -m pip install . &&\ python3 -m pip install -e . &&\
rm -rf build rm -rf build
ENV LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/opt/tritonserver/lib:$LD_LIBRARY_PATH
.header-logo { .header-logo {
background-image: url("../image/lmdeploy-logo.png"); background-image: url("../image/lmdeploy-logo.svg");
background-size: 150px 60px; background-size: 257px 60px;
height: 60px; height: 60px;
width: 150px; width: 257px;
}
@media screen and (min-width: 1100px) {
.header-logo {
top: -15px;
}
}
pre {
white-space: pre;
}
@media screen and (min-width: 2000px) {
.pytorch-content-left {
width: 1200px;
margin-left: 30px;
}
article.pytorch-article {
max-width: 1200px;
}
.pytorch-breadcrumbs-wrapper {
width: 1200px;
}
.pytorch-right-menu.scrolling-fixed {
position: fixed;
top: 45px;
left: 1580px;
}
}
article.pytorch-article section code {
padding: .2em .4em;
background-color: #f3f4f7;
border-radius: 5px;
}
/* Disable the change in tables */
article.pytorch-article section table code {
padding: unset;
background-color: unset;
border-radius: unset;
}
table.autosummary td {
width: 50%
}
img.align-center {
display: block;
margin-left: auto;
margin-right: auto;
}
article.pytorch-article p.rubric {
font-weight: bold;
} }
resources/lmdeploy-logo.png
\ No newline at end of file
lmdeploy.lite
-------------
.. automodule:: lmdeploy.lite
:members:
lmdeploy.pytorch
----------------
.. automodule:: lmdeploy.pytorch
:members:
lmdeploy.serve
--------------
.. automodule:: lmdeploy.serve
:members: