Unverified Commit 169d5169 authored by RunningLeon, committed by GitHub

Add more user-friendly CLI (#541)

* add

* import fire in main

* wrap to speed up fire cli

* update

* update docs

* update docs

* fix

* resolve comments

* resolve conflict and add test for cli
parent 7283781e
...@@ -119,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl ...@@ -119,14 +119,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1 GIT_LFS_SKIP_SMUDGE=1
# 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default # 2. Convert InternLM model to turbomind's format, which will be in "./workspace" by default
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
``` ```
#### Inference by TurboMind #### Inference by TurboMind
```shell ```shell
python -m lmdeploy.turbomind.chat ./workspace lmdeploy chat turbomind ./workspace
``` ```
> **Note**<br /> > **Note**<br />
...@@ -140,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace ...@@ -140,7 +140,7 @@ python -m lmdeploy.turbomind.chat ./workspace
#### Serving with gradio #### Serving with gradio
```shell ```shell
python3 -m lmdeploy.serve.gradio.app ./workspace lmdeploy serve gradio ./workspace
``` ```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) ![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
...@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace ...@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
Launch inference server by: Launch inference server by:
```shell ```shell
python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 lmdeploy serve api_server ./workspace --instance_num 32 --tp 1
``` ```
Then, you can communicate with it by command line, Then, you can communicate with it by command line,
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url lmdeploy serve api_client restful_api_url
``` ```
or webui, or webui,
...@@ -165,8 +165,8 @@ or webui, ...@@ -165,8 +165,8 @@ or webui,
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui # server_ip and server_port here are for gradio ui
# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True # example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
``` ```
Refer to [restful_api.md](docs/en/restful_api.md) for more details. Refer to [restful_api.md](docs/en/restful_api.md) for more details.
...@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh ...@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh
Then, you can communicate with the inference server by command line, Then, you can communicate with the inference server by command line,
```shell ```shell
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 lmdeploy serve triton_client {server_ip_addresss}:33337
``` ```
or webui, or webui,
```shell ```shell
python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337 lmdeploy serve gradio {server_ip_addresss}:33337
``` ```
For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md) For the deployment of other supported models, such as LLaMA, LLaMA-2, vicuna and so on, you can find the guide from [here](docs/en/serving.md)
...@@ -200,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc ...@@ -200,7 +200,7 @@ For detailed instructions on Inference pytorch models, see [here](docs/en/pytorc
#### Single GPU #### Single GPU
```shell ```shell
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \ lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \ --max_new_tokens 64 \
--temperature 0.8 \ --temperature 0.8 \
--top_p 0.95 \ --top_p 0.95 \
......
...@@ -120,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl ...@@ -120,14 +120,14 @@ git clone https://huggingface.co/internlm/internlm-chat-7b-v1_1 /path/to/internl
GIT_LFS_SKIP_SMUDGE=1 GIT_LFS_SKIP_SMUDGE=1
# 2. 转换为 turbomind 要求的格式。默认存放路径为 ./workspace # 2. 转换为 turbomind 要求的格式。默认存放路径为 ./workspace
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
``` ```
#### 使用 turbomind 推理 #### 使用 turbomind 推理
```shell ```shell
python3 -m lmdeploy.turbomind.chat ./workspace lmdeploy chat turbomind ./workspace
``` ```
> **Note**<br /> > **Note**<br />
...@@ -140,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace ...@@ -140,7 +140,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
#### 启动 gradio server #### 启动 gradio server
```shell ```shell
python3 -m lmdeploy.serve.gradio.app ./workspace lmdeploy serve gradio ./workspace
``` ```
![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab) ![](https://github.com/InternLM/lmdeploy/assets/67539920/08d1e6f2-3767-44d5-8654-c85767cec2ab)
...@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace ...@@ -150,14 +150,14 @@ python3 -m lmdeploy.serve.gradio.app ./workspace
使用下面的命令启动推理服务: 使用下面的命令启动推理服务:
```shell ```shell
python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
``` ```
你可以通过命令行方式与推理服务进行对话: 你可以通过命令行方式与推理服务进行对话:
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url lmdeploy serve api_client restful_api_url
``` ```
也可以通过 WebUI 方式来对话: 也可以通过 WebUI 方式来对话:
...@@ -165,8 +165,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url ...@@ -165,8 +165,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui # server_ip and server_port here are for gradio ui
# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True # example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
``` ```
更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md) 更多详情可以查阅 [restful_api.md](docs/zh_cn/restful_api.md)
...@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh ...@@ -182,13 +182,13 @@ bash workspace/service_docker_up.sh
你可以通过命令行方式与推理服务进行对话: 你可以通过命令行方式与推理服务进行对话:
```shell ```shell
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337 lmdeploy serve triton_client {server_ip_addresss}:33337
``` ```
也可以通过 WebUI 方式来对话: 也可以通过 WebUI 方式来对话:
```shell ```shell
python3 -m lmdeploy.serve.gradio.app {server_ip_addresss}:33337 lmdeploy serve gradio {server_ip_addresss}:33337
``` ```
其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md) 其他模型的部署方式,比如 LLaMA,LLaMA-2,vicuna等等,请参考[这里](docs/zh_cn/serving.md)
...@@ -204,7 +204,7 @@ pip install deepspeed ...@@ -204,7 +204,7 @@ pip install deepspeed
#### 单个 GPU #### 单个 GPU
```shell ```shell
python3 -m lmdeploy.pytorch.chat $NAME_OR_PATH_TO_HF_MODEL \ lmdeploy chat torch $NAME_OR_PATH_TO_HF_MODEL \
--max_new_tokens 64 \ --max_new_tokens 64 \
--temperature 0.8 \ --temperature 0.8 \
--top_p 0.95 \ --top_p 0.95 \
......
...@@ -18,7 +18,7 @@ dequant: f = q * scale + zp ...@@ -18,7 +18,7 @@ dequant: f = q * scale + zp
Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory. Convert the Hugging Face model format to the TurboMind inference format to create a workspace directory.
```bash ```bash
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
``` ```
If you already have a workspace directory, skip this step. If you already have a workspace directory, skip this step.
...@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps: ...@@ -29,7 +29,7 @@ Get the quantization parameters by these two steps:
```bash ```bash
# get minmax # get minmax
python3 -m lmdeploy.lite.apis.calibrate \ lmdeploy lite calibrate \
--model $HF_MODEL \ --model $HF_MODEL \
--calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval --calib_dataset 'c4' \ # Support c4, ptb, wikitext2, pileval
--calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately --calib_samples 128 \ # Number of samples in the calibration set, if the memory is not enough, it can be adjusted appropriately
...@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ ...@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
--work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format --work_dir $WORK_DIR \ # Directory for saving quantized statistical parameters and quantized weights in Pytorch format
# get quant parameters # get quant parameters
python3 -m lmdeploy.lite.apis.kv_qparams \ lmdeploy lite kv_qparams \
--work_dir $WORK_DIR \ # Directory of the last output --work_dir $WORK_DIR \ # Directory of the last output
--turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters --turbomind_dir workspace/triton_models/weights/ \ # Directory to save the quantization parameters
--kv_sym False \ # Symmetric or asymmetric quantization, default is False --kv_sym False \ # Symmetric or asymmetric quantization, default is False
...@@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre ...@@ -64,7 +64,7 @@ Considering there are four combinations of kernels needed to be implemented, pre
Test the chat performance. Test the chat performance.
```bash ```bash
python3 -m lmdeploy.turbomind.chat ./workspace lmdeploy chat turbomind ./workspace
``` ```
## GPU Memory Test ## GPU Memory Test
......
...@@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and ...@@ -9,13 +9,13 @@ This submodule allow user to chat with language model through command line, and
**Example 1**: Chat with default setting **Example 1**: Chat with default setting
```shell ```shell
python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL lmdeploy chat torch $PATH_TO_HF_MODEL
``` ```
**Example 2**: Disable sampling and chat history **Example 2**: Disable sampling and chat history
```shell ```shell
python -m lmdeploy.pytorch.chat \ lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0 --max-history 0 --temperature 0 --max-history 0
``` ```
...@@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \ ...@@ -23,7 +23,7 @@ python -m lmdeploy.pytorch.chat \
**Example 3**: Accelerate with deepspeed inference **Example 3**: Accelerate with deepspeed inference
```shell ```shell
python -m lmdeploy.pytorch.chat \ lmdeploy chat torch \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ $PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed --accel deepspeed
``` ```
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
### Launch Service ### Launch Service
```shell ```shell
python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1 lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
``` ```
Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage. Then, the user can open the swagger UI: `http://{server_ip}:{server_port}` for the detailed api usage.
...@@ -125,7 +125,7 @@ There is a client script for restful api server. ...@@ -125,7 +125,7 @@ There is a client script for restful api server.
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url lmdeploy serve api_client restful_api_url
``` ```
### webui ### webui
...@@ -135,8 +135,8 @@ You can also test restful-api through webui. ...@@ -135,8 +135,8 @@ You can also test restful-api through webui.
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui # server_ip and server_port here are for gradio ui
# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True # example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
``` ```
### FAQ ### FAQ
......
...@@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l ...@@ -8,7 +8,7 @@ You can download [llama-2 models from huggingface](https://huggingface.co/meta-l
<summary><b>7B</b></summary> <summary><b>7B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh ...@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
<summary><b>13B</b></summary> <summary><b>13B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2 lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh ...@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
<summary><b>70B</b></summary> <summary><b>70B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8 lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt ...@@ -42,7 +42,7 @@ Weights for the LLaMA models can be obtained from by filling out [this form](htt
<summary><b>7B</b></summary> <summary><b>7B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \ lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model --tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh ...@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
<summary><b>13B</b></summary> <summary><b>13B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \ lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2 --tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh ...@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
<summary><b>30B</b></summary> <summary><b>30B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \ lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4 --tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh ...@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
<summary><b>65B</b></summary> <summary><b>65B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \ lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8 --tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \ ...@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \ --target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1 --delta-path lmsys/vicuna-7b-delta-v1.1
python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \ ...@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \ --target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1 --delta-path lmsys/vicuna-13b-delta-v1.1
python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
......
...@@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. Execu ...@@ -29,7 +29,7 @@ Based on the above table, download the model that meets your requirements. Execu
python3 -m pip install lmdeploy python3 -m pip install lmdeploy
# convert weight layout # convert weight layout
python3 -m lmdeploy.serve.turbomind.deploy codellama /the/path/of/codellama/model lmdeploy convert codellama /the/path/of/codellama/model
``` ```
Then, you can communicate with codellama in the console by following the instructions in the next sections Then, you can communicate with codellama in the console by following the instructions in the next sections
...@@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in ...@@ -42,13 +42,13 @@ Then, you can communicate with codellama in consolo by following instructions in
### Completion ### Completion
```shell ```shell
python3 -m lmdeploy.turbomind.chat ./workspace --cap completion lmdeploy chat turbomind ./workspace --cap completion
``` ```
### Infilling ### Infilling
```shell ```shell
python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling lmdeploy chat turbomind ./workspace --cap infilling
``` ```
The input code is supposed to have a special placeholder `<FILL>`. For example, The input code is supposed to have a special placeholder `<FILL>`. For example,
...@@ -64,7 +64,7 @@ And the generated code piece by `turbomind.chat` is the one to be filled in `<FI ...@@ -64,7 +64,7 @@ And the generated code piece by `turbomind.chat` is the one to be filled in `<FI
### Chat ### Chat
``` ```
python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python" lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python"
``` ```
`--sys-instruct` instruction can be changed to other coding languages as long as codellama supports it `--sys-instruct` instruction can be changed to other coding languages as long as codellama supports it
...@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid ...@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid
### Python specialist ### Python specialist
``` ```
python3 -m lmdeploy.turbomind.chat ./workspace --cap python lmdeploy chat turbomind ./workspace --cap python
``` ```
Python fine-tuned model is highly recommended when 'python specialist' capability is required. Python fine-tuned model is highly recommended when 'python specialist' capability is required.
...@@ -90,14 +90,14 @@ Launch inference server by: ...@@ -90,14 +90,14 @@ Launch inference server by:
```shell ```shell
# --instance_num: number of instances to perform inference, which can be viewed as the max request concurrency # --instance_num: number of instances to perform inference, which can be viewed as the max request concurrency
# --tp: the number of GPUs used in tensor parallelism # --tp: the number of GPUs used in tensor parallelism
python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 lmdeploy serve api_server ./workspace --server_name ${server_ip} --server_port ${server_port} --instance_num 32 --tp 1
``` ```
Then, you can communicate with it by command line, Then, you can communicate with it by command line,
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url lmdeploy serve api_client restful_api_url
``` ```
or through webui after launching gradio, or through webui after launching gradio,
...@@ -105,8 +105,8 @@ or through webui after launching gradio, ...@@ -105,8 +105,8 @@ or through webui after launching gradio,
```shell ```shell
# restful_api_url is what printed in api_server.py, e.g. http://localhost:23333 # restful_api_url is what printed in api_server.py, e.g. http://localhost:23333
# server_ip and server_port here are for gradio ui # server_ip and server_port here are for gradio ui
# example: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True # example: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
``` ```
Regarding the detailed information of RESTful API, you can refer to [restful_api.md](../restful_api.md). Regarding the detailed information of RESTful API, you can refer to [restful_api.md](../restful_api.md).
...@@ -26,14 +26,14 @@ As demonstrated in the command below, first convert the model's layout using `tu ...@@ -26,14 +26,14 @@ As demonstrated in the command below, first convert the model's layout using `tu
```shell ```shell
## Convert the model's layout and store it in the default path, ./workspace. ## Convert the model's layout and store it in the default path, ./workspace.
python3 -m lmdeploy.serve.turbomind.deploy \ lmdeploy convert \
--model-name llama2 \ --model-name llama2 \
--model-path ./llama2-chat-7b-w4 \ --model-path ./llama2-chat-7b-w4 \
--model-format awq \ --model-format awq \
--group-size 128 --group-size 128
## inference ## inference
python3 -m lmdeploy.turbomind.chat ./workspace lmdeploy chat turbomind ./workspace
``` ```
## Serve with gradio ## Serve with gradio
...@@ -41,7 +41,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace ...@@ -41,7 +41,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
If you wish to interact with the model via web ui, please initiate the gradio server as indicated below: If you wish to interact with the model via web ui, please initiate the gradio server as indicated below:
```shell ```shell
python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port} lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port}
``` ```
Subsequently, you can open the website `http://{ip_addr}:{port}` in your browser and interact with the model Subsequently, you can open the website `http://{ip_addr}:{port}` in your browser and interact with the model
...@@ -82,7 +82,7 @@ It includes two steps: ...@@ -82,7 +82,7 @@ It includes two steps:
### Step 1: Generate Quantization Parameter ### Step 1: Generate Quantization Parameter
```shell ```shell
python3 -m lmdeploy.lite.apis.calibrate \ lmdeploy lite calibrate \
--model $HF_MODEL \ --model $HF_MODEL \
--calib_dataset 'c4' \ # Calibration dataset, supports c4, ptb, wikitext2, pileval --calib_dataset 'c4' \ # Calibration dataset, supports c4, ptb, wikitext2, pileval
--calib_samples 128 \ # Number of samples in the calibration set, if memory is insufficient, you can appropriately reduce this --calib_samples 128 \ # Number of samples in the calibration set, if memory is insufficient, you can appropriately reduce this
...@@ -95,7 +95,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ ...@@ -95,7 +95,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
LMDeploy employs AWQ algorithm for model weight quantization. LMDeploy employs AWQ algorithm for model weight quantization.
```shell ```shell
python3 -m lmdeploy.lite.apis.auto_awq \ lmdeploy lite auto_awq \
--model $HF_MODEL \ --model $HF_MODEL \
--w_bits 4 \ # Bit number for weight quantization --w_bits 4 \ # Bit number for weight quantization
--w_group_size 128 \ # Group size for weight quantization statistics --w_group_size 128 \ # Group size for weight quantization statistics
......
...@@ -18,7 +18,7 @@ dequant: f = q * scale + zp ...@@ -18,7 +18,7 @@ dequant: f = q * scale + zp
把 huggingface 格式的模型,转成 turbomind 推理格式,得到一个 workspace 目录 把 huggingface 格式的模型,转成 turbomind 推理格式,得到一个 workspace 目录
```bash ```bash
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b lmdeploy convert internlm-chat-7b /path/to/internlm-chat-7b
``` ```
如果已经有 workspace 目录,可以跳过这步。 如果已经有 workspace 目录,可以跳过这步。
...@@ -29,7 +29,7 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch ...@@ -29,7 +29,7 @@ python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-ch
```bash ```bash
# 计算 minmax # 计算 minmax
python3 -m lmdeploy.lite.apis.calibrate \ lmdeploy lite calibrate \
--model $HF_MODEL \ --model $HF_MODEL \
--calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval --calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval
--calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 --calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小
...@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ ...@@ -37,7 +37,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
--work_dir $WORK_DIR \ # 保存 Pytorch 格式量化统计参数和量化后权重的文件夹 --work_dir $WORK_DIR \ # 保存 Pytorch 格式量化统计参数和量化后权重的文件夹
# 通过 minmax 获取量化参数 # 通过 minmax 获取量化参数
python3 -m lmdeploy.lite.apis.kv_qparams \ lmdeploy lite kv_qparams \
--work_dir $WORK_DIR \ # 上一步的结果 --work_dir $WORK_DIR \ # 上一步的结果
--turbomind_dir workspace/triton_models/weights/ \ # 保存量化参数的目录,推理要用 --turbomind_dir workspace/triton_models/weights/ \ # 保存量化参数的目录,推理要用
--kv_sym False \ # 对称量化或非对称量化,默认为 False --kv_sym False \ # 对称量化或非对称量化,默认为 False
...@@ -64,7 +64,7 @@ python3 -m lmdeploy.lite.apis.kv_qparams \ ...@@ -64,7 +64,7 @@ python3 -m lmdeploy.lite.apis.kv_qparams \
测试聊天效果 测试聊天效果
```bash ```bash
python3 -m lmdeploy.turbomind.chat ./workspace lmdeploy chat turbomind ./workspace
``` ```
## 显存测试 ## 显存测试
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
运行脚本 运行脚本
```shell ```shell
python3 -m lmdeploy.serve.openai.api_server ./workspace 0.0.0.0 server_port --instance_num 32 --tp 1 lmdeploy serve api_server ./workspace 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
``` ```
然后用户可以打开 swagger UI: `http://{server_ip}:{server_port}` 详细查看所有的 API 及其使用方法。 然后用户可以打开 swagger UI: `http://{server_ip}:{server_port}` 详细查看所有的 API 及其使用方法。
...@@ -127,7 +127,7 @@ restful api 服务可以通过客户端测试,例如 ...@@ -127,7 +127,7 @@ restful api 服务可以通过客户端测试,例如
```shell ```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 # restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url lmdeploy serve api_client restful_api_url
``` ```
### webui ### webui
...@@ -137,8 +137,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url ...@@ -137,8 +137,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell ```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 # restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
# server_ip 和 server_port 是用来提供 gradio ui 访问服务的 # server_ip 和 server_port 是用来提供 gradio ui 访问服务的
# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True # 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
``` ```
### FAQ ### FAQ
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
<summary><b>7B</b></summary> <summary><b>7B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-7b-chat-hf lmdeploy convert llama2 /path/to/llama-2-7b-chat-hf
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh ...@@ -18,7 +18,7 @@ bash workspace/service_docker_up.sh
<summary><b>13B</b></summary> <summary><b>13B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-13b-chat-hf --tp 2 lmdeploy convert llama2 /path/to/llama-2-13b-chat-hf --tp 2
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh ...@@ -28,7 +28,7 @@ bash workspace/service_docker_up.sh
<summary><b>70B</b></summary> <summary><b>70B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama2 /path/to/llama-2-70b-chat-hf --tp 8 lmdeploy convert llama2 /path/to/llama-2-70b-chat-hf --tp 8
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -42,7 +42,7 @@ bash workspace/service_docker_up.sh ...@@ -42,7 +42,7 @@ bash workspace/service_docker_up.sh
<summary><b>7B</b></summary> <summary><b>7B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-7b llama \ lmdeploy convert llama /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model --tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh ...@@ -53,7 +53,7 @@ bash workspace/service_docker_up.sh
<summary><b>13B</b></summary> <summary><b>13B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-13b llama \ lmdeploy convert llama /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2 --tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh ...@@ -64,7 +64,7 @@ bash workspace/service_docker_up.sh
<summary><b>30B</b></summary> <summary><b>30B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-30b llama \ lmdeploy convert llama /path/to/llama-30b llama \
--tokenizer_path /path/to/tokenizer/model --tp 4 --tokenizer_path /path/to/tokenizer/model --tp 4
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh ...@@ -75,7 +75,7 @@ bash workspace/service_docker_up.sh
<summary><b>65B</b></summary> <summary><b>65B</b></summary>
```shell ```shell
python3 -m lmdeploy.serve.turbomind.deploy llama /path/to/llama-65b llama \ lmdeploy convert llama /path/to/llama-65b llama \
--tokenizer_path /path/to/tokenizer/model --tp 8 --tokenizer_path /path/to/tokenizer/model --tp 8
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \ ...@@ -94,7 +94,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-7b \ --target-model-path /path/to/vicuna-7b \
--delta-path lmsys/vicuna-7b-delta-v1.1 --delta-path lmsys/vicuna-7b-delta-v1.1
python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-7b lmdeploy convert vicuna /path/to/vicuna-7b
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
...@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \ ...@@ -110,7 +110,7 @@ python3 -m fastchat.model.apply_delta \
--target-model-path /path/to/vicuna-13b \ --target-model-path /path/to/vicuna-13b \
--delta-path lmsys/vicuna-13b-delta-v1.1 --delta-path lmsys/vicuna-13b-delta-v1.1
python3 -m lmdeploy.serve.turbomind.deploy vicuna /path/to/vicuna-13b lmdeploy convert vicuna /path/to/vicuna-13b
bash workspace/service_docker_up.sh bash workspace/service_docker_up.sh
``` ```
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
python3 -m pip install lmdeploy python3 -m pip install lmdeploy
# 转模型格式 # 转模型格式
python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model lmdeploy convert codellama /path/of/codellama/model
``` ```
接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。 接下来,可参考如下章节,在控制台与 codellama 进行交互式对话。
...@@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model ...@@ -42,13 +42,13 @@ python3 -m lmdeploy.serve.turbomind.deploy codellama /path/of/codellama/model
### 代码续写 ### 代码续写
```shell ```shell
python3 -m lmdeploy.turbomind.chat ./workspace --cap completion lmdeploy chat turbomind ./workspace --cap completion
``` ```
### 代码填空 ### 代码填空
```shell ```shell
python3 -m lmdeploy.turbomind.chat ./workspace --cap infilling lmdeploy chat turbomind ./workspace --cap infilling
``` ```
输入的代码块中要包含 `<FILL>`,比如: 输入的代码块中要包含 `<FILL>`,比如:
...@@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str: ...@@ -64,7 +64,7 @@ def remove_non_ascii(s: str) -> str:
### 对话 ### 对话
``` ```
python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provide answers in Python" lmdeploy chat turbomind ./workspace --cap chat --sys-instruct "Provide answers in Python"
``` ```
可以把 `--sys-instruct` 的指令换成 codellama 支持的其他编程语言。 可以把 `--sys-instruct` 的指令换成 codellama 支持的其他编程语言。
...@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid ...@@ -72,7 +72,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace --cap chat --sys-instruct "Provid
### Python 专项 ### Python 专项
``` ```
python3 -m lmdeploy.turbomind.chat ./workspace --cap python lmdeploy chat turbomind ./workspace --cap python
``` ```
建议这里部署 Python 微调模型 建议这里部署 Python 微调模型
...@@ -90,7 +90,7 @@ TBD ...@@ -90,7 +90,7 @@ TBD
```shell ```shell
# --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数 # --instance_num: turbomind推理实例的个数。可理解为支持的最大并发数
# --tp: 在 tensor parallel时,使用的GPU数量 # --tp: 在 tensor parallel时,使用的GPU数量
python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --instance_num 32 --tp 1 lmdeploy serve api_server ./workspace --server_name 0.0.0.0 --server_port ${server_port} --instance_num 32 --tp 1
``` ```
打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。 打开 `http://{server_ip}:{server_port}`,即可访问 swagger,查阅 RESTful API 的详细信息。
...@@ -99,7 +99,7 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port -- ...@@ -99,7 +99,7 @@ python3 -m lmdeploy.serve.openai.api_server ./workspace server_ip server_port --
```shell ```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 # restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
python -m lmdeploy.serve.openai.api_client restful_api_url lmdeploy serve api_client restful_api_url
``` ```
或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流: 或者,启动 gradio,在 webui 的聊天对话框中,与 codellama 交流:
...@@ -107,8 +107,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url ...@@ -107,8 +107,8 @@ python -m lmdeploy.serve.openai.api_client restful_api_url
```shell ```shell
# restful_api_url 就是 api_server 产生的,比如 http://localhost:23333 # restful_api_url 就是 api_server 产生的,比如 http://localhost:23333
# server_ip 和 server_port 是用来提供 gradio ui 访问服务的 # server_ip 和 server_port 是用来提供 gradio ui 访问服务的
# 例子: python -m lmdeploy.serve.gradio.app http://localhost:23333 localhost 6006 --restful_api True # 例子: lmdeploy serve gradio http://localhost:23333 --server_name localhost --server_port 6006 --restful_api True
python -m lmdeploy.serve.gradio.app restful_api_url server_ip --restful_api True lmdeploy serve gradio restful_api_url --server_name ${server_ip} --server_port ${server_port} --restful_api True
``` ```
关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。 关于 RESTful API的详细介绍,请参考[这份](../restful_api.md)文档。
...@@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4 ...@@ -24,14 +24,14 @@ git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
```shell ```shell
## 转换模型的layout,存放在默认路径 ./workspace 下 ## 转换模型的layout,存放在默认路径 ./workspace 下
python3 -m lmdeploy.serve.turbomind.deploy \ lmdeploy convert \
--model-name llama2 \ --model-name llama2 \
--model-path ./llama2-chat-7b-w4 \ --model-path ./llama2-chat-7b-w4 \
--model-format awq \ --model-format awq \
--group-size 128 --group-size 128
## 推理 ## 推理
python3 -m lmdeploy.turbomind.chat ./workspace lmdeploy chat turbomind ./workspace
``` ```
## 启动 gradio 服务 ## 启动 gradio 服务
...@@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace ...@@ -39,7 +39,7 @@ python3 -m lmdeploy.turbomind.chat ./workspace
如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务 如果想通过 webui 与模型对话,请执行以下命令启动 gradio 服务
```shell ```shell
python3 -m lmdeploy.serve.turbomind ./workspace --server_name {ip_addr} ----server_port {port} lmdeploy serve gradio ./workspace --server_name {ip_addr} --server_port {port}
``` ```
然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话 然后,在浏览器中打开 http://{ip_addr}:{port},即可在线对话
...@@ -80,7 +80,7 @@ python benchmark/profile_generation.py \ ...@@ -80,7 +80,7 @@ python benchmark/profile_generation.py \
### 第一步:生成量化参数 ### 第一步:生成量化参数
```shell ```shell
python3 -m lmdeploy.lite.apis.calibrate \ lmdeploy lite calibrate \
--model $HF_MODEL \ --model $HF_MODEL \
--calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval --calib_dataset 'c4' \ # 校准数据集,支持 c4, ptb, wikitext2, pileval
--calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小 --calib_samples 128 \ # 校准集的样本数,如果显存不够,可以适当调小
...@@ -93,7 +93,7 @@ python3 -m lmdeploy.lite.apis.calibrate \ ...@@ -93,7 +93,7 @@ python3 -m lmdeploy.lite.apis.calibrate \
LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。 LMDeploy 使用 AWQ 算法对模型权重进行量化。在执行下面的命令时,需要把步骤1的`$WORK_DIR`传入。量化结束后,权重文件也会存放在这个目录中。然后就可以根据 ["4bit权重模型推理"](#4bit-权重模型推理)章节的说明,进行模型推理。
```shell ```shell
python3 -m lmdeploy.lite.apis.auto_awq \ lmdeploy lite auto_awq \
--model $HF_MODEL \ --model $HF_MODEL \
--w_bits 4 \ # 权重量化的 bit 数 --w_bits 4 \ # 权重量化的 bit 数
--w_group_size 128 \ # 权重量化分组统计尺寸 --w_group_size 128 \ # 权重量化分组统计尺寸
......
# Copyright (c) OpenMMLab. All rights reserved.
from .cli import run
__all__ = ['run']
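This `__init__.py` simply re-exports `run`, which the documentation above calls as the `lmdeploy` command. As a hedged sketch only: wiring a function like this up as a console command is typically done with a `console_scripts` entry point, for example in a hypothetical setup.py fragment (the project's actual packaging may differ):

```python
# Hypothetical setup.py fragment, shown only to illustrate how the `lmdeploy`
# command could be mapped to lmdeploy.cli:run; not taken from this PR.
from setuptools import setup

setup(
    name='lmdeploy',
    entry_points={
        'console_scripts': ['lmdeploy = lmdeploy.cli:run'],
    },
)
```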
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional
class SubCliChat(object):
"""Chat through terminal with pytorch or turbomind model."""
def torch(self,
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None):
"""Chat with pytorch model through terminal.
Args:
model_path (str): Path to pytorch model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use the fast tokenizer.
This argument is passed directly to transformers'
``AutoTokenizer.from_pretrained``.
Generally, users should choose the fast tokenizers.
But if the fast tokenizer raises an error, try forcing the use of a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed across all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force the use of a specific adapter.
Generally, users should not need this argument because the adapter is
selected based on the model type. It is only required when automatic
selection is impossible, e.g. when llama 1 and 2 cannot be distinguished
from the `LlamaForCausalLM` class alone.
Currently, only "llama1" is accepted for llama1 models.
""" # noqa: E501
from lmdeploy.pytorch.chat import main as run_torch_model
run_torch_model(model_path,
tokenizer_path=tokenizer_path,
accel=accel,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
seed=seed,
use_fast_tokenizer=use_fast_tokenizer,
max_alloc=max_alloc,
max_session_len=max_session_len,
log_file=log_file,
debug=debug,
adapter=adapter)
def turbomind(self,
model_path,
session_id: int = 1,
cap: str = 'chat',
tp=1,
stream_output=True,
**kwargs):
"""Chat with turbomind model through terminal.
Args:
model_path (str): the path of the deployed model
session_id (int): the identical id of a session
cap (str): the capability of the model. For example, codellama
supports ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing the model's chat
template
"""
from lmdeploy.turbomind.chat import main as run_turbomind_model
run_turbomind_model(model_path,
session_id=session_id,
cap=cap,
tp=tp,
stream_output=stream_output,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import fire
from .chat import SubCliChat
from .lite import SubCliLite
from .serve import SubCliServe
class CLI(object):
"""LMDeploy Command Line Interface.
The CLI provides a unified API for converting, compressing and deploying
large language models.
"""
def convert(self,
model_name: str,
model_path: str,
model_format: str = None,
tokenizer_path: str = None,
dst_path: str = './workspace',
tp: int = 1,
quant_path: str = None,
group_size: int = 0):
"""Convert LLMs to lmdeploy format.
Args:
model_name (str): The name of the model to be deployed, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model
model_format (str): The format of the model, fb or hf. 'fb' stands
for META's llama format, and 'hf' means huggingface format.
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
should be 2^n.
quant_path (str): Path of the quantized model, which can be None.
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
"""
from lmdeploy.serve.turbomind.deploy import main as convert
convert(model_name,
model_path,
model_format=model_format,
tokenizer_path=tokenizer_path,
dst_path=dst_path,
tp=tp,
quant_path=quant_path,
group_size=group_size)
def run():
"""The entry point of running LMDeploy CLI."""
cli = CLI()
cli.lite = SubCliLite()
cli.chat = SubCliChat()
cli.serve = SubCliServe()
fire.Fire(cli, name='lmdeploy')
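`run()` attaches the sub-command groups as attributes of the root `CLI` object and hands it to python-fire, which maps `lmdeploy <group> <command> --flag value` onto attribute lookups and method calls. Below is a minimal, self-contained sketch of that dispatch pattern with hypothetical `Tool`/`Greet` names, not the LMDeploy classes:

```python
# Minimal sketch of python-fire's nested-component dispatch (hypothetical names).
import fire


class Greet:
    """A command group: its methods become subcommands."""

    def hello(self, name: str = 'world'):
        return f'hello, {name}'


class Tool:
    """The root CLI object: attributes become command groups."""

    def __init__(self):
        self.greet = Greet()


if __name__ == '__main__':
    fire.Fire(Tool(), name='tool')
```

Running `python tool.py greet hello --name Ada` prints `hello, Ada`, mirroring how `lmdeploy chat turbomind ./workspace` resolves to `SubCliChat.turbomind('./workspace')`.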
# Copyright (c) OpenMMLab. All rights reserved.
class SubCliLite(object):
"""CLI for compressing LLMs."""
def auto_awq(self,
model: str,
work_dir: str,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):
"""Perform weight quantization using AWQ algorithm.
Args:
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
w_bits (int): Bit number for weight quantization.
w_sym (bool): Whether to do symmetric quantization.
w_group_size (int): Group size for weight quantization statistics.
device (str): Device type to run on.
"""
from lmdeploy.lite.apis.auto_awq import auto_awq
auto_awq(model,
work_dir,
w_bits=w_bits,
w_sym=w_sym,
w_group_size=w_group_size,
device=device)
def calibrate(self,
model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
calib_seqlen: int = 2048,
work_dir: str = './work_dir',
device: str = 'cuda') -> None:
"""Perform calibration on a given dataset.
Args:
model (str): The model to be loaded.
calib_dataset (str, optional): The calibration dataset name.
Defaults to 'c4'.
calib_samples (int, optional): The number of samples for
calibration. Defaults to 128.
calib_seqlen (int, optional): The sequence length for calibration.
Defaults to 2048.
work_dir (str): The working directory for outputs.
Defaults to './work_dir'.
device (str, optional): The device to be used for calculation.
Defaults to 'cuda'.
"""
from lmdeploy.lite.apis.calibrate import calibrate
calibrate(model,
calib_dataset=calib_dataset,
calib_samples=calib_samples,
calib_seqlen=calib_seqlen,
work_dir=work_dir,
device=device)
def kv_qparams(self,
work_dir: str,
turbomind_dir: str,
kv_bits: int = 8,
kv_sym: bool = False,
num_tp: int = 1) -> None:
"""Export key and value stats.
Args:
work_dir (str): Directory path where the stats
are saved.
turbomind_dir (str): Directory path where the
results are saved.
kv_bits (int, optional): Number of bits for quantization.
Defaults to 8.
kv_sym (bool, optional): Whether to use symmetric quantization.
Defaults to False.
num_tp (int, optional): The number of GPUs used for tensor parallelism.
Defaults to 1.
"""
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
run_kv_qparams(work_dir,
turbomind_dir,
kv_bits=kv_bits,
kv_sym=kv_sym,
num_tp=num_tp)
def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
"""Convert a hugging face model to the smallest sharded one.
Args:
src_dir (str): The directory of the input HF model.
dst_dir (str): The directory to save new model.
"""
from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
run_sharded(src_dir, dst_dir)
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
class SubCliServe(object):
"""Serve LLMs and interact on terminal or web UI."""
def gradio(self,
model_path_or_server: str,
server_name: str = 'localhost',
server_port: int = 6006,
batch_size: int = 32,
tp: int = 1,
restful_api: bool = False):
"""Serve LLMs with web ui using gradio.
Example 1:
lmdeploy serve gradio ./workspace
Example 2:
lmdeploy serve gradio http://localhost:23333
--server_name localhost
--server_port 6006
--restful_api True
Example 3:
lmdeploy serve gradio ${triton_server_ip_addresss}:33337
Args:
model_path_or_server (str): the path of the deployed model, the
tritonserver URL, or the restful api URL. A local path directly
runs the service with gradio, while a URL is treated as a
tritonserver address by default. If the URL is a restful api URL,
please also enable the `restful_api` flag.
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
restful_api (bool): a flag for model_path_or_server
"""
from lmdeploy.serve.gradio.app import run
run(model_path_or_server,
server_name=server_name,
server_port=server_port,
batch_size=batch_size,
tp=tp,
restful_api=restful_api)
def api_server(self,
model_path: str,
server_name: str = 'localhost',
server_port: int = 23333,
instance_num: int = 32,
tp: int = 1,
allow_origins: List[str] = ['*'],
allow_credentials: bool = True,
allow_methods: List[str] = ['*'],
allow_headers: List[str] = ['*']):
"""Serve LLMs with restful api using fastapi.
Args:
model_path (str): the path of the deployed model
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): tensor parallel
allow_origins (List[str]): a list of allowed origins for CORS
allow_credentials (bool): whether to allow credentials for CORS
allow_methods (List[str]): a list of allowed HTTP methods for CORS
allow_headers (List[str]): a list of allowed HTTP headers for CORS
"""
from lmdeploy.serve.openai.api_server import main as run_api_server
run_api_server(model_path,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
allow_origins=allow_origins,
allow_credentials=allow_credentials,
allow_methods=allow_methods,
allow_headers=allow_headers)
def api_client(self, restful_api_url: str, session_id: int = 0):
"""Interact with restful api server in terminal.
Args:
restful_api_url: The restful api URL.
session_id: The identical id of a session.
"""
from lmdeploy.serve.openai.api_client import main as run_api_client
run_api_client(restful_api_url, session_id=session_id)
def triton_client(self,
tritonserver_addr: str,
session_id: int = 1,
cap: str = 'chat',
stream_output: bool = True,
**kwargs):
"""Interact with Triton Server using gRPC protocol.
Args:
tritonserver_addr (str): the address in format "ip:port" of
triton inference server
session_id (int): the identical id of a session
cap (str): the capability of the model. For example, codellama
supports ['completion', 'infill', 'instruct',
'python']
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's
chat template
"""
from lmdeploy.serve.client import main as run_triton_client
run_triton_client(
tritonserver_addr,
session_id=session_id,
cap=cap,
stream_output=stream_output,
**kwargs,
)
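The commit message mentions adding a test for the CLI. A minimal smoke test in that spirit would only check that the command groups expose the methods used throughout the docs above; this is a hedged sketch that assumes the files shown here live under `lmdeploy/cli/` as `cli.py`, `chat.py`, `lite.py` and `serve.py`, and it is not necessarily the test added in this PR:

```python
# Hypothetical smoke test; the module paths are assumed from the imports above.
from lmdeploy.cli.chat import SubCliChat
from lmdeploy.cli.cli import CLI
from lmdeploy.cli.lite import SubCliLite
from lmdeploy.cli.serve import SubCliServe


def test_cli_exposes_subcommands():
    # The root CLI defines `convert`; the sub-groups are attached in run().
    assert callable(CLI().convert)
    assert callable(SubCliChat().torch) and callable(SubCliChat().turbomind)
    assert callable(SubCliLite().calibrate) and callable(SubCliLite().kv_qparams)
    assert callable(SubCliServe().api_server) and callable(SubCliServe().api_client)
```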
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
from pathlib import Path from pathlib import Path
import fire
import torch import torch
from accelerate import (infer_auto_device_map, init_empty_weights, from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model) load_checkpoint_in_model)
...@@ -81,5 +80,6 @@ def auto_awq(model: str, ...@@ -81,5 +80,6 @@ def auto_awq(model: str,
if __name__ == '__main__': if __name__ == '__main__':
import fire
fire.Fire(auto_awq) fire.Fire(auto_awq)
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
from pathlib import Path from pathlib import Path
import fire
import torch import torch
from accelerate import (infer_auto_device_map, init_empty_weights, from accelerate import (infer_auto_device_map, init_empty_weights,
load_checkpoint_in_model) load_checkpoint_in_model)
...@@ -107,4 +106,6 @@ def calibrate(model: str, ...@@ -107,4 +106,6 @@ def calibrate(model: str,
if __name__ == '__main__': if __name__ == '__main__':
import fire
fire.Fire(calibrate) fire.Fire(calibrate)
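These last two hunks move `import fire` out of module scope and into the `__main__` block, matching the deferred `from lmdeploy...` imports inside the CLI methods above: heavy dependencies are loaded only when a subcommand actually runs, so `lmdeploy --help` starts quickly. A small illustrative sketch of the pattern (the inner import is just a stand-in, not LMDeploy code):

```python
class SubCliExample:
    """Illustrates the deferred-import pattern used by the CLI classes above."""

    def heavy_command(self, path: str = '.'):
        # The import lives inside the method, so it only runs when this
        # subcommand is invoked; importing this module stays cheap.
        import os.path  # stand-in for a heavy dependency such as torch
        return os.path.abspath(path)


if __name__ == '__main__':
    # fire is likewise imported lazily, mirroring the hunks above.
    import fire
    fire.Fire(SubCliExample())
```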