Unverified Commit 0cc48011 authored by AllentDan, committed by GitHub

Add webui (#27)

* add webui

* update readme

* resolve comments

* readme
parent cc93136e
@@ -158,6 +158,13 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
python3 llmdeploy/serve/client.py {server_ip_address}:33337 1
```
## Inference with Web UI
```shell
python3 llmdeploy/webui/app.py {server_ip_address}:33337 model_name
```
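Since `app.py` dispatches through `fire.Fire(run)`, the two positional arguments map onto `run(triton_server_addr, model_name, ...)`, so the UI can also be launched programmatically. A minimal sketch, assuming the package layout in this commit; the server address and model name below are placeholders:

```python
# A sketch, not a documented API: launch the web UI from Python.
# The server address and model name are placeholders; use your own.
from llmdeploy.webui.app import run

run('33.33.33.33:33337',    # triton_server_addr
    'llama',                # model_name known to the server
    server_name='0.0.0.0',  # bind to all interfaces instead of localhost
    server_port=6006)       # default port used by run()
```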
## User Guide
## Quantization
In fp16 mode, kv_cache int8 quantization can be enabled so that a single GPU can serve more users.
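As a rough numeric illustration of why int8 halves the kv_cache footprint relative to fp16, here is a minimal sketch of symmetric per-tensor quantization; this is illustrative only, not LLMDeploy's actual scheme:

```python
import numpy as np

# Illustrative only: symmetric per-tensor int8 quantization of a fake
# fp16 kv_cache block; LLMDeploy's real scheme may differ (calibration,
# per-channel scales, etc.).
kv = np.random.randn(8, 128).astype(np.float16)   # fake key/value block

scale = np.abs(kv).max() / 127.0                   # single per-tensor scale
kv_int8 = np.clip(np.round(kv / scale), -127, 127).astype(np.int8)
kv_dequant = (kv_int8 * scale).astype(np.float16)  # restored when used

print('int8 bytes:', kv_int8.nbytes, 'vs fp16 bytes:', kv.nbytes)
print('max abs error:', float(np.abs(kv - kv_dequant).max()))
```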
......
@@ -146,6 +146,11 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
python3 llmdeploy/serve/client.py {server_ip_address}:33337 1
```
## Inference with Web UI
```shell
python3 llmdeploy/webui/app.py {server_ip_address}:33337 model_name
```
## Quantized Deployment
In fp16 mode, kv_cache int8 quantization can be enabled so that a single GPU can serve more users.
First run the quantization script; the quantization parameters are stored in the weight directory produced by the `deploy.py` conversion.
......
# flake8: noqa
import os
import threading
from functools import partial

import fire
import gradio as gr

from llmdeploy.serve.fastertransformer.chatbot import Chatbot
from strings import ABSTRACT, TITLE
from styles import PARENT_BLOCK_CSS

def chat_stream(instruction,
                state_chatbot,
                llama_chatbot,
                model_name: str = None):
    """Stream one turn of conversation into the web UI."""
    bot_summarized_response = ''
    model_type = 'fastertransformer'
    # Append the new turn; the reply is filled in as tokens stream back.
    state_chatbot = state_chatbot + [(instruction, None)]
    # Each worker thread serves one user, so its id doubles as session id.
    session_id = threading.current_thread().ident
    bot_response = llama_chatbot.stream_infer(
        session_id, instruction, f'{session_id}-{len(state_chatbot)}')

    yield (state_chatbot, state_chatbot, f'{bot_summarized_response}'.strip())

    for status, tokens, _ in bot_response:
        if state_chatbot[-1][-1] is None or model_type != 'fairscale':
            # fastertransformer streams the full response so far:
            # replace the last reply wholesale.
            state_chatbot[-1] = (state_chatbot[-1][0], tokens)
        else:
            # fairscale streams deltas, accumulated piece by piece.
            state_chatbot[-1] = (state_chatbot[-1][0],
                                 state_chatbot[-1][1] + tokens)
        yield (state_chatbot, state_chatbot,
               f'{bot_summarized_response}'.strip())

    yield (state_chatbot, state_chatbot, f'{bot_summarized_response}'.strip())

def reset_textbox():
    """Clear the instruction textbox."""
    return gr.Textbox.update(value='')

def reset_everything_func(instruction_txtbox, state_chatbot, llama_chatbot,
                          triton_server_addr, model_name):
    """Drop the chat history and reconnect with a fresh Chatbot client."""
    state_chatbot = []
    log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
    llama_chatbot = Chatbot(
        triton_server_addr, model_name, log_level=log_level, display=True)

    return (
        llama_chatbot,
        state_chatbot,
        state_chatbot,
        gr.Textbox.update(value=''),
    )

def cancel_func(instruction_txtbox, state_chatbot, llama_chatbot):
    """Cancel the in-flight generation for the current session."""
    session_id = llama_chatbot._session.session_id
    llama_chatbot.cancel(session_id)

    return (
        llama_chatbot,
        state_chatbot,
    )

def run(triton_server_addr: str,
        model_name: str,
        server_name: str = 'localhost',
        server_port: int = 6006):
    """Launch the gradio web UI against a running triton inference server.

    Args:
        triton_server_addr (str): server address, e.g. `{ip}:33337`.
        model_name (str): name of the deployed model.
        server_name (str): host name the web UI binds to.
        server_port (int): port the web UI listens on.
    """
    with gr.Blocks(css=PARENT_BLOCK_CSS, theme='ParityError/Anime') as demo:
        chat_interface = partial(chat_stream, model_name=model_name)
        reset_everything = partial(
            reset_everything_func,
            model_name=model_name,
            triton_server_addr=triton_server_addr)
        log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
        llama_chatbot = gr.State(
            Chatbot(
                triton_server_addr,
                model_name,
                log_level=log_level,
                display=True))
        state_chatbot = gr.State([])

        with gr.Column(elem_id='col_container'):
            gr.Markdown(f'## {TITLE}\n\n\n{ABSTRACT}')
            # with gr.Accordion('Context Setting', open=False):
            #     hidden_txtbox = gr.Textbox(
            #         placeholder='', label='Order', visible=False)

            chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
            instruction_txtbox = gr.Textbox(
                placeholder='What do you want to say to AI?',
                label='Instruction')
            with gr.Row():
                cancel_btn = gr.Button(value='Cancel')
                reset_btn = gr.Button(value='Reset')

        # Submitting the textbox streams the reply into the chat window.
        send_event = instruction_txtbox.submit(
            chat_interface,
            [instruction_txtbox, state_chatbot, llama_chatbot],
            [state_chatbot, chatbot],
            batch=False,
            max_batch_size=1,
        )
        reset_event = instruction_txtbox.submit(
            reset_textbox,
            [],
            [instruction_txtbox],
        )

        # Both buttons also cancel any in-flight streaming request.
        cancel_btn.click(
            cancel_func, [instruction_txtbox, state_chatbot, llama_chatbot],
            [llama_chatbot, chatbot],
            cancels=[send_event])
        reset_btn.click(
            reset_everything,
            [instruction_txtbox, state_chatbot, llama_chatbot],
            [llama_chatbot, state_chatbot, chatbot, instruction_txtbox],
            cancels=[send_event])

    # Serve up to 4 requests concurrently, queueing at most 100.
    demo.queue(
        concurrency_count=4, max_size=100, api_open=True).launch(
            max_threads=10,
            share=True,
            server_port=server_port,
            server_name=server_name,
        )


if __name__ == '__main__':
    fire.Fire(run)
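For reference, the gradio callback above is only a thin wrapper over the streaming client; stripped of UI state, the same call sequence reduces to roughly the following sketch. The server address, model name, prompt, and session id are placeholders:

```python
import os

from llmdeploy.serve.fastertransformer.chatbot import Chatbot

# Placeholders throughout; mirrors how chat_stream() drives the client.
bot = Chatbot('33.33.33.33:33337', 'llama',
              log_level=os.environ.get('SERVICE_LOG_LEVEL', 'INFO'),
              display=True)
session_id = 1
for status, tokens, _ in bot.stream_infer(session_id, 'hello',
                                          f'{session_id}-1'):
    print(tokens)  # response so far; app.py replaces rather than appends
```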
# flake8: noqa
TITLE = 'LLMDeploy Playground'
ABSTRACT = """
Thanks to [LLM-As-Chatbot](https://github.com/deep-diver/LLM-As-Chatbot), this application was modified from it.
"""
PARENT_BLOCK_CSS = """
#col_container {
width: 95%;
margin-left: auto;
margin-right: auto;
}
#chatbot {
height: 500px;
overflow: auto;
}
.chat_wrap_space {
margin-left: 0.5em
}
"""