Unverified Commit 0cc48011 authored by AllentDan, committed by GitHub

Add webui (#27)

* add webui

* update readme

* resolve comments

* readme
parent cc93136e
@@ -158,6 +158,13 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
python3 llmdeploy/serve/client.py {server_ip_address}:33337 1
```
## Inference with Web UI
```shell
python3 llmdeploy/webui/app.py {server_ip_address}:33337 model_name
```
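Since `app.py` dispatches through `fire.Fire(run)`, the two positional arguments map onto `run(triton_server_addr, model_name, ...)`, so the UI can also be launched programmatically. A minimal sketch, assuming the package layout in this commit; the server address and model name below are placeholders:

```python
# A sketch, not a documented API: launch the web UI from Python.
# The server address and model name are placeholders; use your own.
from llmdeploy.webui.app import run

run('33.33.33.33:33337',    # triton_server_addr
    'llama',                # model_name known to the server
    server_name='0.0.0.0',  # bind to all interfaces instead of localhost
    server_port=6006)       # default port used by run()
```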
## User Guide
## Quantization
In fp16 mode, kv_cache int8 quantization can be enabled so that a single GPU can serve more users.
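As a rough numeric illustration of why int8 halves the kv_cache footprint relative to fp16, here is a minimal sketch of symmetric per-tensor quantization; this is illustrative only, not LLMDeploy's actual scheme:

```python
import numpy as np

# Illustrative only: symmetric per-tensor int8 quantization of a fake
# fp16 kv_cache block; LLMDeploy's real scheme may differ (calibration,
# per-channel scales, etc.).
kv = np.random.randn(8, 128).astype(np.float16)   # fake key/value block

scale = np.abs(kv).max() / 127.0                   # single per-tensor scale
kv_int8 = np.clip(np.round(kv / scale), -127, 127).astype(np.int8)
kv_dequant = (kv_int8 * scale).astype(np.float16)  # restored when used

print('int8 bytes:', kv_int8.nbytes, 'vs fp16 bytes:', kv.nbytes)
print('max abs error:', float(np.abs(kv - kv_dequant).max()))
```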
......
@@ -146,6 +146,11 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
python3 llmdeploy/serve/client.py {server_ip_address}:33337 1
```
## Inference with Web UI
```shell
python3 llmdeploy/webui/app.py {server_ip_address}:33337 model_name
```
## Quantized Deployment
In fp16 mode, kv_cache int8 quantization can be enabled so that a single GPU can serve more users.
First run the quantization script; the quantization parameters are stored in the weight directory produced by the `deploy.py` conversion.
......
# flake8: noqa
import os
import threading
from functools import partial

import fire
import gradio as gr

from llmdeploy.serve.fastertransformer.chatbot import Chatbot
from strings import ABSTRACT, TITLE
from styles import PARENT_BLOCK_CSS

def chat_stream(instruction,
                state_chatbot,
                llama_chatbot,
                model_name: str = None):
    """Stream one turn of conversation into the web UI."""
    bot_summarized_response = ''
    model_type = 'fastertransformer'
    # Append the new turn; the reply is filled in as tokens stream back.
    state_chatbot = state_chatbot + [(instruction, None)]
    # Each worker thread serves one user, so its id doubles as session id.
    session_id = threading.current_thread().ident
    bot_response = llama_chatbot.stream_infer(
        session_id, instruction, f'{session_id}-{len(state_chatbot)}')

    yield (state_chatbot, state_chatbot, f'{bot_summarized_response}'.strip())

    for status, tokens, _ in bot_response:
        if state_chatbot[-1][-1] is None or model_type != 'fairscale':
            # fastertransformer streams the full response so far:
            # replace the last reply wholesale.
            state_chatbot[-1] = (state_chatbot[-1][0], tokens)
        else:
            # fairscale streams deltas, accumulated piece by piece.
            state_chatbot[-1] = (state_chatbot[-1][0],
                                 state_chatbot[-1][1] + tokens)
        yield (state_chatbot, state_chatbot,
               f'{bot_summarized_response}'.strip())

    yield (state_chatbot, state_chatbot, f'{bot_summarized_response}'.strip())

def reset_textbox():
    """Clear the instruction textbox."""
    return gr.Textbox.update(value='')

def reset_everything_func(instruction_txtbox, state_chatbot, llama_chatbot,
                          triton_server_addr, model_name):
    """Drop the chat history and reconnect with a fresh Chatbot client."""
    state_chatbot = []
    log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
    llama_chatbot = Chatbot(
        triton_server_addr, model_name, log_level=log_level, display=True)

    return (
        llama_chatbot,
        state_chatbot,
        state_chatbot,
        gr.Textbox.update(value=''),
    )

def cancel_func(instruction_txtbox, state_chatbot, llama_chatbot):
    """Cancel the in-flight generation for the current session."""
    session_id = llama_chatbot._session.session_id
    llama_chatbot.cancel(session_id)

    return (
        llama_chatbot,
        state_chatbot,
    )

def run(triton_server_addr: str,
        model_name: str,
        server_name: str = 'localhost',
        server_port: int = 6006):
    """Launch the gradio web UI against a running triton inference server.

    Args:
        triton_server_addr (str): server address, e.g. `{ip}:33337`.
        model_name (str): name of the deployed model.
        server_name (str): host name the web UI binds to.
        server_port (int): port the web UI listens on.
    """
    with gr.Blocks(css=PARENT_BLOCK_CSS, theme='ParityError/Anime') as demo:
        chat_interface = partial(chat_stream, model_name=model_name)
        reset_everything = partial(
            reset_everything_func,
            model_name=model_name,
            triton_server_addr=triton_server_addr)
        log_level = os.environ.get('SERVICE_LOG_LEVEL', 'INFO')
        llama_chatbot = gr.State(
            Chatbot(
                triton_server_addr,
                model_name,
                log_level=log_level,
                display=True))
        state_chatbot = gr.State([])

        with gr.Column(elem_id='col_container'):
            gr.Markdown(f'## {TITLE}\n\n\n{ABSTRACT}')
            # with gr.Accordion('Context Setting', open=False):
            #     hidden_txtbox = gr.Textbox(
            #         placeholder='', label='Order', visible=False)

            chatbot = gr.Chatbot(elem_id='chatbot', label=model_name)
            instruction_txtbox = gr.Textbox(
                placeholder='What do you want to say to AI?',
                label='Instruction')
            with gr.Row():
                cancel_btn = gr.Button(value='Cancel')
                reset_btn = gr.Button(value='Reset')

        # Submitting the textbox streams the reply into the chat window.
        send_event = instruction_txtbox.submit(
            chat_interface,
            [instruction_txtbox, state_chatbot, llama_chatbot],
            [state_chatbot, chatbot],
            batch=False,
            max_batch_size=1,
        )
        reset_event = instruction_txtbox.submit(
            reset_textbox,
            [],
            [instruction_txtbox],
        )

        # Both buttons also cancel any in-flight streaming request.
        cancel_btn.click(
            cancel_func, [instruction_txtbox, state_chatbot, llama_chatbot],
            [llama_chatbot, chatbot],
            cancels=[send_event])
        reset_btn.click(
            reset_everything,
            [instruction_txtbox, state_chatbot, llama_chatbot],
            [llama_chatbot, state_chatbot, chatbot, instruction_txtbox],
            cancels=[send_event])

    # Serve up to 4 requests concurrently, queueing at most 100.
    demo.queue(
        concurrency_count=4, max_size=100, api_open=True).launch(
            max_threads=10,
            share=True,
            server_port=server_port,
            server_name=server_name,
        )


if __name__ == '__main__':
    fire.Fire(run)
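For reference, the gradio callback above is only a thin wrapper over the streaming client; stripped of UI state, the same call sequence reduces to roughly the following sketch. The server address, model name, prompt, and session id are placeholders:

```python
import os

from llmdeploy.serve.fastertransformer.chatbot import Chatbot

# Placeholders throughout; mirrors how chat_stream() drives the client.
bot = Chatbot('33.33.33.33:33337', 'llama',
              log_level=os.environ.get('SERVICE_LOG_LEVEL', 'INFO'),
              display=True)
session_id = 1
for status, tokens, _ in bot.stream_infer(session_id, 'hello',
                                          f'{session_id}-1'):
    print(tokens)  # response so far; app.py replaces rather than appends
```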
# flake8: noqa
TITLE = 'LLMDeploy Playground'
ABSTRACT = """
Thanks to [LLM-As-Chatbot](https://github.com/deep-diver/LLM-As-Chatbot), this application was modified from it.
"""
PARENT_BLOCK_CSS = """
#col_container {
width: 95%;
margin-left: auto;
margin-right: auto;
}
#chatbot {
height: 500px;
overflow: auto;
}
.chat_wrap_space {
margin-left: 0.5em
}
"""