# Video Demo
[Chinese README](./README_zh.md)
This folder contains sample code for running the CogVLM2-Video model.
## Installation
Before running the code, make sure that the dependencies in `basic_demo` and the additional dependencies in this folder are installed:
```shell
pip install -r requirements.txt
```
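If the dependencies from `basic_demo` are not installed yet, you can install both requirement files together (this assumes the default repository layout, with `basic_demo` located next to this folder):
```shell
pip install -r ../basic_demo/requirements.txt
pip install -r requirements.txt
```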
## CLI Demo
Run this script to start a conversation in the command line. Note that the model must be loaded on a single GPU:
```shell
CUDA_VISIBLE_DEVICES=0 python cli_demo.py
```
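`cli_demo.py` also accepts a `--quant` flag for 4-bit or 8-bit loading; if your GPU has less than 48 GB of memory, the script will ask you to either quantize or use a multi-GPU script, for example:
```shell
CUDA_VISIBLE_DEVICES=0 python cli_demo.py --quant 4
```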
## RESTful API Demo
Run this code to launch a RESTful API server:
```shell
python api_demo.py
```
This starts a RESTful API on port 5000. Run the following code to send a request to the server:
```shell
python test_api.py
```
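The server exposes a single `POST /video_qa` endpoint (see `api_demo.py`): it accepts a multipart `video` file plus optional `question` and `temperature` form fields and returns JSON with an `answer` field. A minimal client call, equivalent to `test_api.py` (the file name `test.mp4` is only a placeholder), looks like this:
```python
import requests

# Send a video and a question to the local api_demo.py server.
with open("test.mp4", "rb") as f:
    response = requests.post(
        "http://127.0.0.1:5000/video_qa",
        files={"video": f},
        data={"question": "Describe this video in detail.", "temperature": 0.2},
    )
print(response.json()["answer"])
```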
## Gradio Demo
After launching the RESTful API server, you can run this code to start a Gradio web demo; the web page forwards requests to the API server at `http://127.0.0.1:5000`:
```shell
python gradio_demo.py
```
Then open your browser and visit `http://0.0.0.0:7868/` to chat with the model.
# Video Demo
[Read this in English.](./README.md)
This folder contains sample code for running the CogVLM2-Video model.
## Installation
Before running the code, make sure that the dependencies in `basic_demo` and the additional dependencies in this folder are installed:
```shell
pip install -r requirements.txt
```
## CLI Demo
Run this script to start a conversation in the command line. Note that the model must be loaded on a single GPU:
```shell
CUDA_VISIBLE_DEVICES=0 python cli_demo.py
```
## RESTful API
Run the following code to launch a RESTful API server:
```shell
python api_demo.py
```
This starts a RESTful API on port 5000. Run the following code to send a request to the server:
```shell
python test_api.py
```
## Gradio Demo
After launching the RESTful API server, you can run the following code to start a Gradio web demo:
```shell
python gradio_demo.py
```
Then open your browser and visit `http://0.0.0.0:7868/` to chat with the model.
from flask import Flask, request, jsonify
import traceback
from inference import predict
app = Flask(__name__)
@app.route('/video_qa', methods=['POST'])
def video_qa():
if 'video' not in request.files:
return jsonify({'error': 'no video file found'}), 400
video = request.files['video']
if video.filename == '':
return jsonify({'error': 'no chosen file'}), 400
if 'question' not in request.form:
question = ""
else:
question = request.form['question']
if question is None or question == "" or question == "@Caption":
question = "Please describe the video in detail."
print("Get question:", question)
if 'temperature' not in request.form:
temperature = 0.001
print("No temperature found, use default value 0.001")
else:
temperature = float(request.form['temperature'])
print("Get temperature:", temperature)
try:
answer = predict(prompt=question, video_data=video.read(), temperature=temperature)
return jsonify(
{"answer": answer})
    except Exception:
traceback.print_exc()
return jsonify({"error": traceback.format_exc()}), 500
if __name__ == '__main__':
app.run(debug=False, host="0.0.0.0", port=5000)
import io
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import argparse
MODEL_PATH = "THUDM/cogvlm2-video-llama3-chat"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
0] >= 8 else torch.float16
parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0)
args = parser.parse_args()
if 'int4' in MODEL_PATH:
args.quant = 4
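# Frame sampling used by load_video below:
#   - 'base' strategy: sample num_frames (24) frames evenly from the first 60 seconds of the clip.
#   - 'chat' strategy: take the frame closest to each whole second, up to num_frames frames.
# The returned tensor is permuted from (T, H, W, C) to (C, T, H, W) before being fed to the model.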
def load_video(video_path, strategy='chat'):
bridge.set_bridge('torch')
with open(video_path, 'rb') as f:
mp4_stream = f.read()
num_frames = 24
if mp4_stream is not None:
decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
else:
decord_vr = VideoReader(video_path, ctx=cpu(0))
frame_id_list = None
total_frames = len(decord_vr)
if strategy == 'base':
clip_end_sec = 60
clip_start_sec = 0
start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
end_frame = min(total_frames,
int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames
frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
elif strategy == 'chat':
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = []
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index)
if len(frame_id_list) >= num_frames:
break
video_data = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# padding_side="left"
)
if torch.cuda.is_available() and torch.cuda.get_device_properties(0).total_memory < 48 * 1024 ** 3 and not args.quant:
print("GPU memory is less than 48GB. Please use cli_demo_multi_gpus.py or pass `--quant 4` or `--quant 8`.")
exit()
# Load the model
if args.quant == 4:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True
).eval()
elif args.quant == 8:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
quantization_config=BitsAndBytesConfig(
load_in_8bit=True,
bnb_4bit_compute_dtype=TORCH_TYPE,
),
low_cpu_mem_usage=True
).eval()
else:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True
).eval().to(DEVICE)
while True:
    strategy = 'base' if 'cogvlm2-video-llama3-base' in MODEL_PATH else 'chat'
    print(f"Using the '{strategy}' conversation template")
video_path = input("video path >>>>> ")
if video_path == '':
        print('No video path was entered; the following will be a plain-text conversation.')
video = None
else:
video = load_video(video_path, strategy=strategy)
history = []
while True:
query = input("Human:")
if query == "clear":
break
inputs = model.build_conversation_input_ids(
tokenizer=tokenizer,
query=query,
images=[video],
history=history,
template_version=strategy
)
inputs = {
'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE),
'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE),
            'images': [[inputs['images'][0].to(DEVICE).to(TORCH_TYPE)]],
}
gen_kwargs = {
"max_new_tokens": 2048,
"pad_token_id": 128002,
"top_k": 1,
"do_sample": True,
"top_p": 0.1,
"temperature": 0.1,
}
with torch.no_grad():
outputs = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nCogVLM2-Video:", response)
history.append((query, response))
import gradio as gr
import requests
def load_video_data(video_path):
with open(video_path, 'rb') as file:
video_data = file.read()
return video_data
class ChatAgent:
def __init__(self):
pass
def answer(self, video_path, prompt, max_new_tokens, num_beams, temperature):
url = 'http://127.0.0.1:5000/video_qa'
files = {'video': open(video_path, 'rb')}
data = {'question': prompt, 'temperature': temperature}
response = requests.post(url, files=files, data=data)
if response.status_code != 200:
return f"Something went wrong: {response.text}"
else:
return response.json()["answer"]
def gradio_reset():
return (
None,
gr.update(value=None, interactive=True),
gr.update(placeholder='Please upload your video first', interactive=False),
gr.update(value="Upload & Start Chat", interactive=True),
)
def upload_video(gr_video):
if gr_video is None:
return None, gr.update(interactive=True, placeholder='Please upload video/image first!'), gr.update(
interactive=True)
else:
print(f"Get video: {gr_video}")
return (
gr.update(interactive=True),
gr.update(interactive=True, placeholder='Type and press Enter'),
gr.update(value="Start Chatting", interactive=False)
)
def gradio_ask(user_message, chatbot):
if len(user_message) == 0:
return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot
chatbot = chatbot + [[user_message, None]]
return '', chatbot
def gradio_answer(video_path, chatbot, num_beams, temperature):
if len(chatbot) == 0 or video_path is None:
return chatbot
response = agent.answer(video_path=video_path, prompt=chatbot[-1][0], max_new_tokens=200, num_beams=num_beams,
temperature=temperature)
print(f"Question: {chatbot[-1][0]} Answer: {response}")
chatbot[-1][1] = response
return chatbot
agent = ChatAgent()
def main():
with gr.Blocks(title="VideoHub",
                   css="#chatbot {overflow:auto; height:500px;} #InputVideo {overflow:visible; height:320px;} footer {visibility: hidden}") as demo:
with gr.Row():
with gr.Column(scale=0.5, visible=True) as video_upload:
with gr.Tab("Video", elem_id='video_tab'):
up_video = gr.Video(interactive=True, include_audio=True, elem_id="video_upload", height=360)
upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
temperature = gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.1,
step=0.1,
interactive=True,
label="Temperature",
)
num_beams = gr.Slider(
minimum=1,
maximum=5,
value=1,
step=1,
interactive=True,
label="beam search numbers",
)
with gr.Column(visible=True) as input_raws:
chatbot = gr.Chatbot(elem_id="chatbot", label='VideoHub')
with gr.Row():
with gr.Column(scale=0.7):
text_input = gr.Textbox(show_label=False, placeholder='Please upload your video first',
interactive=False, container=False)
with gr.Column(scale=0.15, min_width=0):
run = gr.Button("💭Send")
with gr.Column(scale=0.15, min_width=0):
clear = gr.Button("🔄Clear")
upload_button.click(upload_video, [up_video],
[up_video, text_input, upload_button])
text_input.submit(gradio_ask, [text_input, chatbot],
[text_input, chatbot]).then(
gradio_answer, [up_video, chatbot, num_beams, temperature], [chatbot]
)
run.click(gradio_ask, [text_input, chatbot], [text_input, chatbot]).then(
gradio_answer, [up_video, chatbot, num_beams, temperature], [chatbot]
)
run.click(lambda: "", None, text_input)
clear.click(gradio_reset, [],
[chatbot, up_video, text_input, upload_button], queue=False)
demo.launch(server_name="0.0.0.0", server_port=7868)
if __name__ == '__main__':
main()
import io
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
MODEL_PATH = "THUDM/cogvlm2-video-llama3-chat"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
0] >= 8 else torch.float16
parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0)
args = parser.parse_args([])
def load_video(video_data, strategy='chat'):
bridge.set_bridge('torch')
mp4_stream = video_data
num_frames = 24
decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
frame_id_list = None
total_frames = len(decord_vr)
if strategy == 'base':
clip_end_sec = 60
clip_start_sec = 0
start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
end_frame = min(total_frames,
int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames
frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
elif strategy == 'chat':
timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
timestamps = [i[0] for i in timestamps]
max_second = round(max(timestamps)) + 1
frame_id_list = []
for second in range(max_second):
closest_num = min(timestamps, key=lambda x: abs(x - second))
index = timestamps.index(closest_num)
frame_id_list.append(index)
if len(frame_id_list) >= num_frames:
break
# while len(frame_id_list) < num_frames:
# frame_id_list.append(frame_id_list[-1])
video_data = decord_vr.get_batch(frame_id_list)
video_data = video_data.permute(3, 0, 1, 2)
return video_data
tokenizer = AutoTokenizer.from_pretrained(
MODEL_PATH,
trust_remote_code=True,
# padding_side="left"
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True
).eval().to(DEVICE)
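# predict() performs a single-turn video QA pass: it samples frames from the raw MP4 bytes,
# builds the conversation inputs with the 'chat' template, and decodes greedily
# (do_sample=False), so the temperature argument currently has no effect on the generated answer.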
def predict(prompt, video_data, temperature):
strategy = 'chat'
video = load_video(video_data, strategy=strategy)
history = []
query = prompt
inputs = model.build_conversation_input_ids(
tokenizer=tokenizer,
query=query,
images=[video],
history=history,
template_version=strategy
)
    inputs = {
        'input_ids': inputs['input_ids'].unsqueeze(0).to(DEVICE),
        'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to(DEVICE),
        'attention_mask': inputs['attention_mask'].unsqueeze(0).to(DEVICE),
        'images': [[inputs['images'][0].to(DEVICE).to(TORCH_TYPE)]],
    }
gen_kwargs = {
"max_new_tokens": 2048,
"pad_token_id": 128002,
"top_k": 1,
"do_sample": False,
"top_p": 0.1,
"temperature": temperature,
}
with torch.no_grad():
outputs = model.generate(**inputs, **gen_kwargs)
outputs = outputs[:, inputs['input_ids'].shape[1]:]
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
return response
decord>=0.6.0
# See https://download.pytorch.org/whl/torch/; the supported Python versions are [3.8, 3.11]
torch==2.1.0
torchvision==0.16.0
pytorchvideo==0.1.5
xformers
transformers==4.42.4
# If transformers raises an error at runtime, it may be an issue in the transformers code; try installing the latest version:
#git+https://github.com/huggingface/transformers.git
huggingface-hub>=0.23.0
pillow
chainlit>=1.0
pydantic>=2.7.1
timm>=0.9.16
openai>=1.30.1
loguru>=0.7.2
einops
sse-starlette>=2.1.0
flask
gunicorn
gevent
requests
gradio
import requests
url = 'http://127.0.0.1:5000/video_qa'
video_file = "test.mp4"
question = "Describe this video in detail."
temperature = 0.2
files = {'video': open(video_file, 'rb')}
data = {'question': question, 'temperature': temperature}
response = requests.post(url, files=files, data=data)
print(response.json()["answer"])
"""
This is a simple chat demo using CogVLM2 model in ChainLit.
"""
import os
import dataclasses
from typing import List
from PIL import Image
import chainlit as cl
from chainlit.input_widget import Slider
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from huggingface_hub.inference._generated.types import TextGenerationStreamOutput, TextGenerationStreamOutputToken
import threading
import torch
os.environ["HIP_VISIBLE_DEVICES"] = "7"
MODEL_PATH = './cogvlm2-llama3-chinese-chat-19B'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[
0] >= 8 else torch.float16
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
quant = int(os.environ.get('QUANT', 0))
if 'int4' in MODEL_PATH:
quant = 4
print(f'Quant = {quant}')
# Load the model
if quant == 4:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
load_in_4bit=True,
low_cpu_mem_usage=True
).eval()
elif quant == 8:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True,
load_in_8bit=True, # Assuming transformers support this argument; check documentation if not
low_cpu_mem_usage=True
).eval()
else:
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
trust_remote_code=True
).eval().to(DEVICE)
@cl.on_chat_start
def on_chat_start():
    print("Welcome to the CogVLM2 chat demo")
async def get_response(query, history, gen_kwargs, images=None):
if images is None:
input_by_model = model.build_conversation_input_ids(
tokenizer,
query=query,
history=history,
template_version='chat'
)
else:
input_by_model = model.build_conversation_input_ids(
tokenizer,
query=query,
history=history,
            images=images[-1:],  # only use the last image; CogVLM2 supports a single image
template_version='chat'
)
inputs = {
'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
'images': [[input_by_model['images'][0].to(DEVICE).to(TORCH_TYPE)]] if images is not None else None,
}
streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs['streamer'] = streamer
gen_kwargs = {**gen_kwargs, **inputs}
with torch.no_grad():
thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
thread.start()
for next_text in streamer:
yield TextGenerationStreamOutput(
index=0,
token=TextGenerationStreamOutputToken(
id=0,
logprob=0,
text=next_text,
special=False,
)
)
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
roles: List[str]
messages: List[List[str]]
version: str = "Unknown"
def append_message(self, role, message):
self.messages.append([role, message])
def get_prompt(self):
if not self.messages:
return None, []
last_role, last_msg = self.messages[-2]
if isinstance(last_msg, tuple):
query, _ = last_msg
else:
query = last_msg
history = []
for role, msg in self.messages[:-2]:
if isinstance(msg, tuple):
text, _ = msg
else:
text = msg
if role == "USER":
history.append((text, ""))
else:
if history:
history[-1] = (history[-1][0], text)
return query, history
def get_images(self):
for role, msg in reversed(self.messages):
if isinstance(msg, tuple):
msg, image = msg
if image is None:
continue
if image.mode != 'RGB':
image = image.convert('RGB')
width, height = image.size
if width > 1344 or height > 1344:
max_len = 1344
aspect_ratio = width / height
if width > height:
new_width = max_len
new_height = int(new_width / aspect_ratio)
else:
new_height = max_len
new_width = int(new_height * aspect_ratio)
image = image.resize((new_width, new_height))
return [image]
return None
def copy(self):
return Conversation(
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
version=self.version,
)
    def dict(self):
        images = self.get_images()
        if images:
return {
"roles": self.roles,
"messages": [
[x, y[0] if type(y) is tuple else y] for x, y in self.messages
],
}
return {
"roles": self.roles,
"messages": self.messages,
}
default_conversation = Conversation(
roles=("USER", "ASSISTANT"),
messages=()
)
async def request(conversation: Conversation, settings):
gen_kwargs = {
"temperature": settings["temperature"],
"top_p": settings["top_p"],
"max_new_tokens": int(settings["max_token"]),
"top_k": int(settings["top_k"]),
"do_sample": True,
}
query, history = conversation.get_prompt()
images = conversation.get_images()
chainlit_message = cl.Message(content="", author="CogVLM2")
text = ""
async for response in get_response(query, history, gen_kwargs, images):
output = response.token.text
text += output
conversation.messages[-1][-1] = text
await chainlit_message.stream_token(text, is_sequence=True)
await chainlit_message.send()
return conversation
@cl.on_chat_start
async def start():
settings = await cl.ChatSettings(
[
Slider(id="temperature", label="Temperature", initial=0.5, min=0.01, max=1, step=0.05),
Slider(id="top_p", label="Top P", initial=0.7, min=0, max=1, step=0.1),
Slider(id="top_k", label="Top K", initial=5, min=0, max=50, step=1),
Slider(id="max_token", label="Max output tokens", initial=2048, min=0, max=8192, step=1),
]
).send()
conversation = default_conversation.copy()
cl.user_session.set("conversation", conversation)
cl.user_session.set("settings", settings)
@cl.on_settings_update
async def setup_agent(settings):
cl.user_session.set("settings", settings)
@cl.on_message
async def main(message: cl.Message):
image = next(
(
Image.open(file.path)
for file in message.elements or []
if "image" in file.mime and file.path is not None
),
None,
)
conv = cl.user_session.get("conversation") # type: Conversation
settings = cl.user_session.get("settings")
text = message.content
conv_message = (text, image)
conv.append_message(conv.roles[0], conv_message)
conv.append_message(conv.roles[1], None)
conv = await request(conv, settings)
cl.user_session.set("conversation", conv)
chainlit run web_demo.py
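web_demo.py also reads an optional QUANT environment variable (set it to 4 or 8) to load the model with quantization, so it can be launched as, for example, QUANT=4 chainlit run web_demo.py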