Commit 5e887c2c authored by wanglch

Initial commit
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001
MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path_to_data"
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS finetune.py \
--model_name_or_path $MODEL \
--data_path $DATA \
--bf16 True \
--fix_vit True \
--output_dir output_qwen \
--num_train_epochs 5 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--lazy_preprocess True \
--use_lora \
--gradient_checkpointing \
--deepspeed finetune/ds_config_zero2.json
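For reference, the file passed via `--data_path` is a JSON list of conversation records in the format produced by the labeling and merge scripts below. A minimal sketch of one record (the id, image path, and texts are placeholder values):

```python
# Sketch of a single training record; merge.py below emits a list of these.
import json

example_record = {
    "id": "abc123xyz0",  # placeholder; merge.py uses the annotation filename
    "conversations": [
        {"from": "user", "value": "Picture 1: <img>pathtoyourimages/abc123xyz0.jpg</img>\nDescribe the image."},
        {"from": "assistant", "value": "A short answer about the image."}
    ]
}
# --data_path expects a JSON file containing a list of such records.
print(json.dumps([example_record], ensure_ascii=False, indent=4))
```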
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path_to_data"
export CUDA_VISIBLE_DEVICES=0
python finetune.py \
--model_name_or_path $MODEL \
--data_path $DATA \
--bf16 True \
--fix_vit True \
--output_dir output_qwen \
--num_train_epochs 5 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--lazy_preprocess True \
--gradient_checkpointing \
--use_lora
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001
MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path_to_data"
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
# Remember to use --fp16 instead of --bf16 due to autogptq
torchrun $DISTRIBUTED_ARGS finetune.py \
--model_name_or_path $MODEL \
--data_path $DATA \
--fp16 True \
--fix_vit True \
--output_dir output_qwen \
--num_train_epochs 5 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--lazy_preprocess True \
--use_lora \
--q_lora \
--gradient_checkpointing \
--deepspeed finetune/ds_config_zero2.json
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="path_to_data"
export CUDA_VISIBLE_DEVICES=0
# Remember to use --fp16 instead of --bf16 due to autogptq
python finetune.py \
--model_name_or_path $MODEL \
--data_path $DATA \
--fp16 True \
--fix_vit True \
--output_dir output_qwen \
--num_train_epochs 5 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 8 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--lazy_preprocess True \
--gradient_checkpointing \
--use_lora \
--q_lora \
--deepspeed finetune/ds_config_zero2.json
import tkinter as tk
from tkinter import filedialog, Label, simpledialog
from PIL import Image, ImageTk
import json
import random
import string
import os
import shutil
import subprocess
import sys

# Create the main window
root = tk.Tk()
root.title("Image Viewer with Dialogue")
root.geometry("1000x600")  # Default window size

dialogue_entries = []  # Dynamically created dialogue entry boxes
image_path = None      # Path of the currently loaded image

# Load and display an image
def load_and_display_image():
    global image_label
    global image_path
    file_path = filedialog.askopenfilename(filetypes=[("Image files", "*.png;*.jpg;*.jpeg")])
    image_path = file_path
    if file_path:
        image = Image.open(file_path)
        # Scale the display width from the original aspect ratio (display height is fixed at 400)
        width = int((400 / image.height) * image.width)
        image = image.resize((width, 400), Image.LANCZOS)
        photo = ImageTk.PhotoImage(image)
        if 'image_label' in globals():
            image_label.config(image=photo)
        else:
            image_label = tk.Label(image_frame, image=photo)
            image_label.pack(padx=7, pady=7)
        image_label.image = photo  # keep a reference to the image

# Create a labeled input box
def create_labeled_input(parent, label_text):
    label = tk.Label(parent, text=label_text)
    label.pack(side="top")
    entry = tk.Entry(parent, width=50)
    entry.pack(side="top")
    return entry

# Dynamically add a pair of dialogue entry boxes
def add_dialogue_boxes():
    user_entry = create_labeled_input(root, "User:")
    assistant_entry = create_labeled_input(root, "Assistant:")
    dialogue_entries.append((user_entry, assistant_entry))

# Save the entered text as a JSON file
def save_to_json():
    save_dir = "saves"
    # Generate a random 10-character string as the file name
    random_filename = ''.join(random.choices(string.ascii_lowercase + string.digits, k=10))
    # Make sure save_dir exists
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # Full path of the JSON file
    json_file_path = os.path.join(save_dir, random_filename + '.json')
    # Full path of the image file
    image_file_path = os.path.join(save_dir, random_filename + '.jpg')
    cat1 = "Picture 1: <img>"
    cat2 = random_filename
    cat3 = ".jpg</img>\n"
    cat4 = input_box_1.get()
    cat = cat1 + cat2 + cat3 + cat4
    dialogue_data = {
        "conversations": []
    }
    dialogue_data["conversations"].append({
        "from": "user",
        "value": cat
    })
    dialogue_data["conversations"].append({
        "from": "assistant",
        "value": input_box_2.get()
    })
    for user_entry, assistant_entry in dialogue_entries:
        dialogue_data["conversations"].append({
            "from": "user",
            "value": user_entry.get()
        })
        dialogue_data["conversations"].append({
            "from": "assistant",
            "value": assistant_entry.get()
        })
    # Write the dialogue data to the JSON file
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(dialogue_data, json_file, ensure_ascii=False, indent=4)
    # Copy the image into the save folder
    if image_path and os.path.isfile(image_path):
        shutil.copy2(image_path, image_file_path)
    subprocess.Popen(["python", "test.py"])
    sys.exit()

# Button to add another dialogue turn
add_dialogue_button = tk.Button(root, text="Add Dialogue", command=add_dialogue_boxes)
add_dialogue_button.pack(side="top", padx=(5, 0), pady=(7, 0))
# Save button
save_button = tk.Button(root, text="Save", command=save_to_json)
save_button.pack(side="top", pady=(7, 0))
# Labeled input boxes for the first turn
input_box_1 = create_labeled_input(root, "Input with the picture:")
input_box_2 = create_labeled_input(root, "Assistant:")
# Image display frame on the left
image_frame = tk.Frame(root)
image_frame.pack(side="left", anchor="nw", padx=7, pady=7)
# Button to open an image
open_button = tk.Button(root, text="Open Image", command=load_and_display_image)
open_button.pack(side="top", pady=(7, 0))

root.mainloop()
import os
import json
from collections import OrderedDict
import re

# Directory with the freshly annotated data; it is recommended to place merge.py one level above it
directory_path = 'saves/'
# List that collects the merged records
merged_data = []
# Preferably an absolute path to the training images, so Qwen finetuning can locate them
image_path_prefix = 'pathtoyourimages/'
# Regular expression matching <img>*.jpg</img>
img_pattern = re.compile(r'<img>(.*?\.jpg)</img>')

# Recursively prefix the image paths inside the loaded JSON
def update_img_paths(obj):
    if isinstance(obj, OrderedDict):
        for key, value in obj.items():
            if isinstance(value, (OrderedDict, list)):
                update_img_paths(value)
            elif isinstance(value, str):
                obj[key] = img_pattern.sub(
                    f'<img>{image_path_prefix}\\1</img>', value)
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            if isinstance(item, (OrderedDict, list)):
                update_img_paths(item)
            elif isinstance(item, str):
                obj[i] = img_pattern.sub(
                    f'<img>{image_path_prefix}\\1</img>', item)

# Walk through every file in the directory
for filename in os.listdir(directory_path):
    # Only process JSON files
    if filename.endswith('.json'):
        # Build the full file path
        file_path = os.path.join(directory_path, filename)
        # Open and read the JSON file
        with open(file_path, 'r', encoding='utf-8') as file:
            try:
                # Load the JSON content
                content = json.load(file, object_pairs_hook=OrderedDict)
                # Build a new OrderedDict so that the 'id' field comes first
                new_content = OrderedDict()
                new_content['id'] = os.path.splitext(filename)[0]
                new_content.update(content)
                update_img_paths(new_content)
                # Append this record to the merged data
                merged_data.append(new_content)
            except json.JSONDecodeError as e:
                print(f"Error reading {filename}: {e}")

# Name of the output JSON file
output_filename = 'merged_data.json'
output_filepath = output_filename
# Write the merged data to the new JSON file
with open(output_filepath, 'w', encoding='utf-8') as output_file:
    json.dump(merged_data, output_file, ensure_ascii=False, indent=4)
print(f"Merge complete. Combined file created at {output_filepath}")
# Unique model identifier
modelCode=674
# Model name
modelName=qwen-vl_pytorch
# Model description
modelDescription=Qwen-VL large-scale vision-language model
# Application scenarios
appScenario=training,inference,finance,education,government,research,manufacturing,energy,transportation
# Framework type
frameType=pytorch
# coding=utf-8
# Implements API for Qwen-7B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8000/docs for documents.
import re
import copy
import json
import time
from argparse import ArgumentParser
from contextlib import asynccontextmanager
from typing import Dict, List, Literal, Optional, Union
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.generation import GenerationConfig
@asynccontextmanager
async def lifespan(app: FastAPI): # collects GPU memory
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system", "function"]
content: Optional[str]
function_call: Optional[Dict] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
functions: Optional[List[Dict]] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
max_length: Optional[int] = None
stream: Optional[bool] = False
stop: Optional[List[str]] = None
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length", "function_call"]
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]]
class ChatCompletionResponse(BaseModel):
model: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[
Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]
]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
@app.get("/v1/models", response_model=ModelList)
async def list_models():
global model_args
model_card = ModelCard(id="gpt-3.5-turbo")
return ModelList(data=[model_card])
# To work around that unpleasant leading-\n tokenization issue!
def add_extra_stop_words(stop_words):
if stop_words:
_stop_words = []
_stop_words.extend(stop_words)
for x in stop_words:
s = x.lstrip("\n")
if s and (s not in _stop_words):
_stop_words.append(s)
return _stop_words
return stop_words
def trim_stop_words(response, stop_words):
if stop_words:
for stop in stop_words:
idx = response.find(stop)
if idx != -1:
response = response[:idx]
return response
TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}"""
REACT_INSTRUCTION = """Answer the following questions as best you can. You have access to the following APIs:
{tools_text}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tools_name_text}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!"""
_TEXT_COMPLETION_CMD = object()
#
# Temporarily, the system role does not work as expected.
# We advise that you write the setups for role-play in your query,
# i.e., use the user role instead of the system role.
#
# TODO: Use real system role when the model is ready.
#
def parse_messages(messages, functions):
if all(m.role != "user" for m in messages):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting at least one user message.",
)
messages = copy.deepcopy(messages)
default_system = "You are a helpful assistant."
system = ""
if messages[0].role == "system":
system = messages.pop(0).content.lstrip("\n").rstrip()
if system == default_system:
system = ""
if functions:
tools_text = []
tools_name_text = []
for func_info in functions:
name = func_info.get("name", "")
name_m = func_info.get("name_for_model", name)
name_h = func_info.get("name_for_human", name)
desc = func_info.get("description", "")
desc_m = func_info.get("description_for_model", desc)
tool = TOOL_DESC.format(
name_for_model=name_m,
name_for_human=name_h,
# Hint: You can add the following format requirements in description:
# "Format the arguments as a JSON object."
# "Enclose the code within triple backticks (`) at the beginning and end of the code."
description_for_model=desc_m,
parameters=json.dumps(func_info["parameters"], ensure_ascii=False),
)
tools_text.append(tool)
tools_name_text.append(name_m)
tools_text = "\n\n".join(tools_text)
tools_name_text = ", ".join(tools_name_text)
system += "\n\n" + REACT_INSTRUCTION.format(
tools_text=tools_text,
tools_name_text=tools_name_text,
)
system = system.lstrip("\n").rstrip()
dummy_thought = {
"en": "\nThought: I now know the final answer.\nFinal answer: ",
"zh": "\nThought: 我会作答了。\nFinal answer: ",
}
_messages = messages
messages = []
for m_idx, m in enumerate(_messages):
role, content, func_call = m.role, m.content, m.function_call
if content:
content = content.lstrip("\n").rstrip()
if role == "function":
if (len(messages) == 0) or (messages[-1].role != "assistant"):
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role assistant before role function.",
)
messages[-1].content += f"\nObservation: {content}"
if m_idx == len(_messages) - 1:
messages[-1].content += "\nThought:"
elif role == "assistant":
if len(messages) == 0:
raise HTTPException(
status_code=400,
detail=f"Invalid request: Expecting role user before role assistant.",
)
last_msg = messages[-1].content
last_msg_has_zh = len(re.findall(r"[\u4e00-\u9fff]+", last_msg)) > 0
if func_call is None:
if functions:
content = dummy_thought["zh" if last_msg_has_zh else "en"] + content
else:
f_name, f_args = func_call["name"], func_call["arguments"]
if not content:
if last_msg_has_zh:
content = f"Thought: 我可以使用 {f_name} API。"
else:
content = f"Thought: I can use {f_name}."
content = f"\n{content}\nAction: {f_name}\nAction Input: {f_args}"
if messages[-1].role == "user":
messages.append(
ChatMessage(role="assistant", content=content.lstrip("\n").rstrip())
)
else:
messages[-1].content += content
elif role == "user":
messages.append(
ChatMessage(role="user", content=content.lstrip("\n").rstrip())
)
else:
raise HTTPException(
status_code=400, detail=f"Invalid request: Incorrect role {role}."
)
query = _TEXT_COMPLETION_CMD
if messages[-1].role == "user":
query = messages[-1].content
messages = messages[:-1]
if len(messages) % 2 != 0:
raise HTTPException(status_code=400, detail="Invalid request")
history = [] # [(Q1, A1), (Q2, A2), ..., (Q_last_turn, A_last_turn)]
for i in range(0, len(messages), 2):
if messages[i].role == "user" and messages[i + 1].role == "assistant":
usr_msg = messages[i].content.lstrip("\n").rstrip()
bot_msg = messages[i + 1].content.lstrip("\n").rstrip()
if system and (i == len(messages) - 2):
usr_msg = f"{system}\n\nQuestion: {usr_msg}"
system = ""
for t in dummy_thought.values():
t = t.lstrip("\n")
if bot_msg.startswith(t) and ("\nAction: " in bot_msg):
bot_msg = bot_msg[len(t) :]
history.append([usr_msg, bot_msg])
else:
raise HTTPException(
status_code=400,
detail="Invalid request: Expecting exactly one user (or function) role before every assistant role.",
)
if system:
assert query is not _TEXT_COMPLETION_CMD
query = f"{system}\n\nQuestion: {query}"
return query, history
def parse_response(response):
func_name, func_args = "", ""
i = response.rfind("\nAction:")
j = response.rfind("\nAction Input:")
k = response.rfind("\nObservation:")
if 0 <= i < j: # If the text has `Action` and `Action input`,
if k < j: # but does not contain `Observation`,
# then it is likely that `Observation` is omitted by the LLM,
# because the output text may have discarded the stop word.
response = response.rstrip() + "\nObservation:" # Add it back.
k = response.rfind("\nObservation:")
func_name = response[i + len("\nAction:") : j].strip()
func_args = response[j + len("\nAction Input:") : k].strip()
if func_name:
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(
role="assistant",
content=response[:i],
function_call={"name": func_name, "arguments": func_args},
),
finish_reason="function_call",
)
return choice_data
z = response.rfind("\nFinal Answer: ")
if z >= 0:
response = response[z + len("\nFinal Answer: ") :]
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop",
)
return choice_data
# completion mode, not chat mode
def text_complete_last_message(history, stop_words_ids):
im_start = "<|im_start|>"
im_end = "<|im_end|>"
prompt = f"{im_start}system\nYou are a helpful assistant.{im_end}"
for i, (query, response) in enumerate(history):
query = query.lstrip("\n").rstrip()
response = response.lstrip("\n").rstrip()
prompt += f"\n{im_start}user\n{query}{im_end}"
prompt += f"\n{im_start}assistant\n{response}{im_end}"
prompt = prompt[: -len(im_end)]
_stop_words_ids = [tokenizer.encode(im_end)]
if stop_words_ids:
for s in stop_words_ids:
_stop_words_ids.append(s)
stop_words_ids = _stop_words_ids
input_ids = torch.tensor([tokenizer.encode(prompt)]).to(model.device)
output = model.generate(input_ids, stop_words_ids=stop_words_ids).tolist()[0]
output = tokenizer.decode(output, errors="ignore")
assert output.startswith(prompt)
output = output[len(prompt) :]
output = trim_stop_words(output, ["<|endoftext|>", im_end])
print(f"<completion>\n{prompt}\n<!-- *** -->\n{output}\n</completion>")
return output
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer
stop_words = add_extra_stop_words(request.stop)
if request.functions:
stop_words = stop_words or []
if "Observation:" not in stop_words:
stop_words.append("Observation:")
query, history = parse_messages(request.messages, request.functions)
if request.stream:
if request.functions:
raise HTTPException(
status_code=400,
detail="Invalid request: Function calling is not yet implemented for stream mode.",
)
# generate = predict(query, history, request.model, stop_words)
# return EventSourceResponse(generate, media_type="text/event-stream")
raise HTTPException(status_code=400, detail="Stream request is not supported currently.")
stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
if query is _TEXT_COMPLETION_CMD:
response = text_complete_last_message(history, stop_words_ids=stop_words_ids)
else:
response, _ = model.chat(
tokenizer,
query,
history=history,
stop_words_ids=stop_words_ids,
append_history=False,
top_p=request.top_p,
temperature=request.temperature,
)
print(f"<chat>\n{history}\n{query}\n<!-- *** -->\n{response}\n</chat>")
response = trim_stop_words(response, stop_words)
if request.functions:
choice_data = parse_response(response)
else:
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop",
)
return ChatCompletionResponse(
model=request.model, choices=[choice_data], object="chat.completion"
)
async def predict(
query: str, history: List[List[str]], model_id: str, stop_words: List[str]
):
global model, tokenizer
choice_data = ChatCompletionResponseStreamChoice(
index=0, delta=DeltaMessage(role="assistant"), finish_reason=None
)
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
current_length = 0
stop_words_ids = [tokenizer.encode(s) for s in stop_words] if stop_words else None
if stop_words:
# TODO: It's a little bit tricky to trim stop words in the stream mode.
raise HTTPException(
status_code=400,
detail="Invalid request: custom stop words are not yet supported for stream mode.",
)
response_generator = model.chat_stream(
tokenizer, query, history=history, stop_words_ids=stop_words_ids
)
for new_response in response_generator:
if len(new_response) == current_length:
continue
new_text = new_response[current_length:]
current_length = len(new_response)
choice_data = ChatCompletionResponseStreamChoice(
index=0, delta=DeltaMessage(content=new_text), finish_reason=None
)
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0, delta=DeltaMessage(), finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id, choices=[choice_data], object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield "[DONE]"
def _get_args():
parser = ArgumentParser()
parser.add_argument(
"-c",
"--checkpoint-path",
type=str,
default="QWen/QWen-7B-Chat",
help="Checkpoint name or path, default to %(default)r",
)
parser.add_argument(
"--cpu-only", action="store_true", help="Run demo with CPU only"
)
parser.add_argument(
"--server-port", type=int, default=8000, help="Demo server port."
)
parser.add_argument(
"--server-name",
type=str,
default="127.0.0.1",
help="Demo server name. Default: 127.0.0.1, which is only visible from the local computer."
" If you want other computers to access your server, use 0.0.0.0 instead.",
)
args = parser.parse_args()
return args
if __name__ == "__main__":
args = _get_args()
tokenizer = AutoTokenizer.from_pretrained(
args.checkpoint_path,
trust_remote_code=True,
resume_download=True,
)
if args.cpu_only:
device_map = "cpu"
else:
device_map = "auto"
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint_path,
device_map=device_map,
trust_remote_code=True,
resume_download=True,
).eval()
model.generation_config = GenerationConfig.from_pretrained(
args.checkpoint_path,
trust_remote_code=True,
resume_download=True,
)
uvicorn.run(app, host=args.server_name, port=args.server_port, workers=1)
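To exercise the server above, here is a minimal client sketch using the `openai` Python package listed in the requirements below (the pre-1.0 `ChatCompletion` interface), assuming the server is running locally on port 8000; the model name is only echoed back, since the loaded checkpoint is fixed by `--checkpoint-path`:

```python
# Minimal sketch of a chat request against openai_api.py running on localhost:8000.
import openai

openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"  # the server performs no authentication

response = openai.ChatCompletion.create(
    model="Qwen-VL-Chat",  # echoed back in the response
    messages=[{"role": "user", "content": "Hello!"}],
    stream=False,          # stream mode is rejected by this server
)
print(response.choices[0].message.content)
```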
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
# Set a random seed if you want reproducible results.
# torch.manual_seed(1234)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval()
model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
query = tokenizer.from_list_format([
{'image': 'assets/mm_tutorial/Rebecca_(1939_poster).jpeg'},
{'text': 'What is the name of the movie in the poster?'},
])
"""
query = tokenizer.from_list_format([
{'text': 'Who directed this movie?'},
])
response, history = model.chat(tokenizer, query=query, history=history)
print(response)
"""
response, history = model.chat(tokenizer, query=query, history=None)
print(response)
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
torch.manual_seed(1234)
tokenizer = AutoTokenizer.from_pretrained("/home/wanglch/projects/Qwen-VL/Qwen-VL-base", trust_remote_code=True)
# Enable bf16 precision; recommended on A100, H100, RTX3060, RTX3070, etc. to save GPU memory
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="auto", trust_remote_code=True, bf16=True).eval()
# Enable fp16 precision; recommended on V100, P100, T4, etc. to save GPU memory
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="auto", trust_remote_code=True, fp16=True).eval()
# Run inference on CPU; requires about 32 GB of RAM
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="cpu", trust_remote_code=True).eval()
# Run inference on GPU (default); requires about 24 GB of GPU memory
model = AutoModelForCausalLM.from_pretrained("/home/wanglch/projects/Qwen-VL/Qwen-VL-base", device_map="cuda", trust_remote_code=True, fp16=True).eval()
# You can specify a different generation length, top_p, and other related hyperparameters (not needed for transformers >= 4.32.0)
# model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
query = tokenizer.from_list_format([
{'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, # Either a local path or a URL
{'text': 'Generate the caption in English with grounding:'},
])
inputs = tokenizer(query, return_tensors='pt')
inputs = inputs.to(model.device)
pred = model.generate(**inputs)
response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
print(response)
# <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>
image = tokenizer.draw_bbox_on_latest_picture(response)
if image:
image.save('2.jpg')
else:
print("no box")
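The grounded caption above wraps each referenced object in `<ref>...</ref>` followed by its box in `<box>(x1,y1),(x2,y2)</box>`, with coordinates normalized to a 0-1000 grid. A minimal sketch of extracting those boxes and rescaling them to pixel coordinates (the helper below is illustrative, not part of the Qwen-VL tokenizer):

```python
# Sketch: pull <ref>/<box> pairs out of a grounded Qwen-VL response and rescale
# the 0-1000 normalized coordinates to pixels for an image of size (width, height).
import re

BOX_PATTERN = re.compile(r"<ref>(.*?)</ref><box>\((\d+),(\d+)\),\((\d+),(\d+)\)</box>")

def extract_boxes(response, width, height):
    boxes = []
    for label, x1, y1, x2, y2 in BOX_PATTERN.findall(response):
        to_px = lambda v, size: int(int(v) / 1000 * size)
        boxes.append((label.strip(),
                      (to_px(x1, width), to_px(y1, height),
                       to_px(x2, width), to_px(y2, height))))
    return boxes
```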
fastapi
uvicorn
openai
pydantic
sse_starlette
{'loss': 2.4436, 'learning_rate': 8.535533905932739e-06, 'epoch': 2.0}
{'loss': 1.7417, 'learning_rate': 5e-06, 'epoch': 2.67}
{'loss': 0.6206, 'learning_rate': 1.4644660940672628e-06, 'epoch': 3.0}
{'loss': 2.3251, 'learning_rate': 0.0, 'epoch': 4.0}
{'train_runtime': 642.8468, 'train_samples_per_second': 0.023, 'train_steps_per_second': 0.008, 'train_loss': 1.9149054765701294, 'epoch': 4.0}
{'loss': 2.4423, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 2.4423, 'learning_rate': 8.535533905932739e-06, 'epoch': 2.0}
{'loss': 1.7389, 'learning_rate': 5e-06, 'epoch': 2.67}
{'loss': 0.6183, 'learning_rate': 1.4644660940672628e-06, 'epoch': 3.0}
{'loss': 2.333, 'learning_rate': 0.0, 'epoch': 4.0}
{'train_runtime': 35.6042, 'train_samples_per_second': 0.421, 'train_steps_per_second': 0.14, 'train_loss': 1.9149360179901123, 'epoch': 4.0}
<br>
<p align="center">
<img src="../assets/touchstone_logo.png" width="300"/>
<p>
<br>
<p align="center">
<a href="../touchstone/README_CN.md">中文</a>&nbsp | &nbspEnglish | &nbsp<a href="../touchstone/README_JA.md">日本語</a>| &nbsp<a href="../touchstone/README_KO.md">한국어</a>
</p>
<br><br>
**TOUCHSTONE** is a comprehensive assessment of multimodal language models, encompassing not only basic recognition and comprehension but also literary creation. By automating the evaluation process and converting multimodal information into text, our TouchStone allows for efficient and accurate assessment of dialogue quality, leveraging the power of advanced language models without the need for manual intervention.
## DATASET
To evaluate the abilities of LVLMs, we construct a diverse and comprehensive dataset that covers five key dimensions: basic descriptive ability, visual recognition ability, visual comprehension ability, visual storytelling ability, and multi-image analysis ability.
- **Basic Descriptive Ability** Image description involves the ability of a model to describe the information contained in an image, including simple and detailed descriptions. Simple descriptions are typically short phrases that describe the main subject and action of the image, while detailed descriptions provide more in-depth information about the image scene, its attributes, and the relationships within it.
- **Visual Recognition Ability** Image recognition is the task of recognizing objects or scenes within an image and inferring relevant information. This area can be further divided into several sub-tasks, including attribute QA, movie/TV recognition, art recognition, landmark recognition, celebrity recognition, emotion recognition, text recognition, object recognition, and structure content recognition.
- **Visual Comprehension Ability** Image understanding involves the ability of a model to understand the meaning of an image and associated tasks. This area encompasses several sub-tasks, such as style appreciation, abstract image understanding, meme understanding, image analysis, chart analysis, general problem-solving, and reasoning QA.
- **Visual Storytelling Ability** The visual storytelling ability is the process of literary creation based on visual content, including writing emails, poetry, stories, ads/commodity recommendations, and brainstorming.
- **Multi-Image Analysis Ability** Multi-image analysis is the task of analyzing and comparing multiple images. This area includes tasks such as comparing two/multiple images, summarizing multiple image information, comparing commodities, and step-by-step analysis of images.
<p align="center">
<img src="../assets/touchstone_datasets.jpg" width="600"/>
<p>
We comprehensively evaluate the model's ability along these five dimensions. As shown in the figure above, examples of the 27 subtasks are given. From perception to cognition to creativity, as the difficulty increases, the demands on the models grow accordingly. Currently, LVLM capabilities are in their early stages. Our dataset contains 800+ questions across 27 categories.
## Methods
We apply a powerful LLM as a judge to enable automated evaluation. So that the judge can effectively comprehend the contents of an image, we manually substitute the actual image input with fine-grained textual annotations. By feeding these annotations and the corresponding questions to a powerful LLM like GPT4, we obtain reference answers.
For the evaluation of the LVLMs, we provide actual images and questions as input and obtain their respective answers. Finally, we employ GPT4 to score the answers generated by the LVLMs based on the fine-grained annotations and questions. The scoring instructions require the model to assess the usefulness, relevance, and accuracy of the answers, considering the annotations as the content of the images. To ensure fairness in the evaluation, each model's answer is compared against a consistent reference answer from GPT4. The average score of the model in all questions is taken as the final score.
To eliminate the influence of answer position, we perform a second scoring round by swapping the positions of the answers and then compute the average of the two scores obtained. This approach aims to mitigate any bias introduced by the placement of the answers.
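A minimal sketch of the swap-and-average step, assuming a hypothetical `judge` call that scores a pair of answers given the textual annotation and question (in practice this is a GPT4 prompt):

```python
# judge(annotation, question, answer_a, answer_b) -> (score_a, score_b) is hypothetical;
# the real scoring is obtained by prompting GPT4 with the fine-grained annotations.
def position_debiased_score(judge, annotation, question, reference_answer, model_answer):
    # Round 1: model answer in the first position, reference answer in the second.
    s_model_first, _ = judge(annotation, question, model_answer, reference_answer)
    # Round 2: positions swapped to cancel any bias from answer placement.
    _, s_model_second = judge(annotation, question, reference_answer, model_answer)
    return (s_model_first + s_model_second) / 2
```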
<p align="center">
<img src="../assets/touchstone_eval.png" width="600"/>
<p>
### Evaluation
#### Evaluation in English-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| PandaGPT | 488.5 |
| MiniGPT4 | 531.7 |
| InstructBLIP | 552.4 |
| LLaMA-AdapterV2 | 590.1 |
| mPLUG-Owl | 605.4 |
| LLaVA | 602.7 |
| Qwen-VL-Chat | 645.2 |
#### Evaluation in Chinese-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| VisualGLM | 247.1 |
| Qwen-VL-Chat | 401.2 |
<br>
<p align="center">
<img src="../assets/touchstone_logo.png" width="300"/>
<p>
<br>
<p align="center">
中文&nbsp | &nbsp<a href="../touchstone/README.md">English</a> | &nbsp<a href="../touchstone/README_JA.md">日本語</a>
</p>
<br><br>
**TOUCHSTONE** is an automated, comprehensive evaluation method for large vision-language models (LVLMs), covering not only basic recognition and comprehension but also literary creation. By converting multimodal information into text through human annotation, our TouchStone can leverage SOTA language models to automatically evaluate the multimodal dialogue quality of LVLMs.
## DATASET
To evaluate the abilities of LVLMs, we construct a diverse and comprehensive dataset that covers five key dimensions: basic descriptive ability, visual recognition ability, visual comprehension ability, visual storytelling ability, and multi-image analysis ability.
- **Basic Descriptive Ability** Image description tests a model's ability to summarize the information in a picture, covering both simple and detailed descriptions. Simple descriptions are typically short phrases that describe the main content and relationships in the image, while detailed descriptions provide more in-depth information about the image scene, its attributes, and relationships.
- **Visual Recognition Ability** Image recognition examines a model's ability to extract the attributes of the content in an image and link them to a knowledge base. The test questions cover attribute QA, movie/TV recognition, art recognition, landmark recognition, celebrity recognition, emotion recognition, text recognition, object recognition, and structured content recognition.
- **Visual Comprehension Ability** Image understanding requires a model to understand the content of an image and reason about it to complete related tasks, such as style appreciation, abstract image understanding, meme understanding, image analysis, chart analysis, general problem-solving, and reasoning QA.
- **Visual Storytelling Ability** Visual storytelling is the ability to perform literary creation based on visual content, including writing emails, poetry, stories, ads/commodity recommendations, and brainstorming.
- **Multi-Image Analysis Ability** Multi-image analysis is the task of analyzing and comparing multiple images, including comparing two or more images, summarizing information from multiple images, comparing commodities, and step-by-step analysis of images.
<p align="center">
<img src="../assets/touchstone_datasets.jpg" width="600"/>
<p>
We comprehensively evaluate the model's ability along these five dimensions. As shown in the figure above, examples of the 27 subtasks are given. From perception to cognition to creativity, as the difficulty increases, the demands on the models grow accordingly. Currently, LVLM capabilities are still in their early stages. Our dataset contains 800+ questions across 27 categories.
## Methods
We apply a SOTA LLM to perform automated evaluation. So that the judge can effectively understand the content of an image, we manually replace the actual image input with fine-grained textual annotations. By feeding these annotations and the corresponding questions to a strong LLM such as GPT4, we obtain reference answers.
For the LVLM under evaluation, we provide the actual images and questions as input and obtain its answers. Finally, we use GPT4 to score the answers generated by the LVLM based on the fine-grained annotations and questions. The scoring instructions require the model to assess the usefulness, relevance, and accuracy of the answers, treating the human annotations as the content of the images. To ensure fairness, each model's answer is compared against a consistent reference answer generated by GPT4. The model's average score over all questions is taken as its final score.
To eliminate the influence of answer position, we perform a second scoring round with the answer positions swapped and then average the two scores.
<p align="center">
<img src="../assets/touchstone_eval.png" width="600"/>
<p>
## Results
#### Evaluation in English-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| PandaGPT | 488.5 |
| MiniGPT4 | 531.7 |
| InstructBLIP | 552.4 |
| LLaMA-AdapterV2 | 590.1 |
| mPLUG-Owl | 605.4 |
| LLaVA | 602.7 |
| Qwen-VL-Chat | 645.2 |
#### Evaluation in Chinese-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| VisualGLM | 247.1 |
| Qwen-VL-Chat | 401.2 |
<br>
<p align="center">
<img src="../assets/touchstone_logo.png" width="300"/>
<p>
<br>
<p align="center">
<a href="touchstone/README_CN.md">中文</a>&nbsp | &nbsp<a href="../touchstone/README.md">English</a>| &nbsp日本語
</p>
<br><br>
**TOUCHSTONE** is a comprehensive assessment of multimodal language models, extending from basic recognition and comprehension to literary creation. By automating the evaluation process and converting multimodal information into text, our TouchStone can leverage the power of advanced language models to evaluate dialogue quality efficiently and accurately without human intervention.
## DATASET
To evaluate the abilities of LVLMs, we construct a diverse and comprehensive dataset covering five key dimensions: basic descriptive ability, visual recognition ability, visual comprehension ability, visual storytelling ability, and multi-image analysis ability.
- **Basic Descriptive Ability** Image description involves the ability of a model to describe the information contained in an image, including simple and detailed descriptions. Simple descriptions are typically short phrases that describe the main subject and action of the image, while detailed descriptions provide more detailed information about the image scene, its attributes, and relationships.
- **Visual Recognition Ability** Image recognition is the task of recognizing objects or scenes within an image and inferring relevant information. This area can be further divided into several sub-tasks, including attribute QA, movie/TV recognition, art recognition, landmark recognition, celebrity recognition, emotion recognition, text recognition, object recognition, and structured content recognition.
- **Visual Comprehension Ability** Image understanding refers to the ability of a model to understand the meaning of an image and the associated tasks. This area includes several sub-tasks, such as style appreciation, abstract image understanding, meme understanding, image analysis, chart analysis, general problem-solving, and reasoning QA.
- **Visual Storytelling Ability** Visual storytelling is the process of literary creation based on visual content, such as writing emails, poetry, stories, ads/commodity recommendations, and brainstorming.
- **Multi-Image Analysis Ability** Multi-image analysis is the task of analyzing and comparing multiple images. This area includes tasks such as comparing two or more images, summarizing information from multiple images, comparing commodities, and step-by-step analysis of images.
<p align="center">
<img src="../assets/touchstone_datasets.jpg" width="600"/>
<p>
We comprehensively evaluate the model's ability along these five dimensions. As shown in the figure above, examples of the 27 subtasks are given. From perception to cognition to creativity, as the difficulty increases, the demands on the models grow accordingly. Currently, LVLM capabilities are in their early stages. Our dataset contains more than 800 questions across 27 categories.
## Methods
To enable automated evaluation, we apply a powerful LLM as a judge. So that the judge can effectively understand the content of an image, we manually replace the actual image input with fine-grained textual annotations. By feeding these annotations and the corresponding questions to a powerful LLM such as GPT4, we obtain reference answers.
For the evaluation of LVLMs, we provide the actual images and questions as input and obtain their answers. Finally, we use GPT4 to score the answers generated by the LVLMs based on the fine-grained annotations and questions. The scoring instructions require the model to assess the usefulness, relevance, and accuracy of the answers, treating the annotations as the content of the images. To ensure fairness, each model's answer is compared against a consistent reference answer from GPT4. The model's average score over all questions is taken as its final score.
To eliminate the influence of answer position, we perform a second scoring round with the answer positions swapped and then average the two scores. This approach aims to mitigate any bias introduced by the placement of the answers.
<p align="center">
<img src="../assets/touchstone_eval.png" width="600"/>
<p>
### Evaluation
#### Evaluation in English-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| PandaGPT | 488.5 |
| MiniGPT4 | 531.7 |
| InstructBLIP | 552.4 |
| LLaMA-AdapterV2 | 590.1 |
| mPLUG-Owl | 605.4 |
| LLaVA | 602.7 |
| Qwen-VL-Chat | 645.2 |
#### Evaluation in Chinese-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| VisualGLM | 247.1 |
| Qwen-VL-Chat | 401.2 |
<br>
<p align="center">
<img src="../assets/touchstone_logo.png" width="300"/>
<p>
<br>
<p align="center">
<a href="../touchstone/README_CN.md">中文</a>&nbsp | &nbspEnglish | &nbsp<a href="../touchstone/README_JA.md">日本語</a> | &nbsp<a href="../touchstone/README_KO.md">한국어</a>
</p>
<br><br>
**TOUCHSTONE** is a comprehensive multimodal language model evaluation that covers not only basic recognition and comprehension but also literary creation. By automating the evaluation process and converting multimodal information into text, TouchStone leverages the capabilities of advanced language models to evaluate dialogue quality efficiently and accurately without manual intervention.
## DATASET
To evaluate the abilities of LVLMs, we construct a diverse and extensive dataset covering five key dimensions: basic descriptive ability, visual recognition ability, visual comprehension ability, visual storytelling ability, and multi-image analysis ability.
- **Basic Descriptive Ability** Image description involves the ability of a model to describe the information contained in an image, including simple and detailed descriptions. Simple descriptions are typically short phrases describing the main subject and action of the image, while detailed descriptions provide more in-depth information about the image scene, its attributes, and relationships.
- **Visual Recognition Ability** Image recognition is the task of recognizing objects or scenes within an image and inferring relevant information. This area can be subdivided into several sub-tasks, including attribute QA, movie/TV recognition, art recognition, landmark recognition, celebrity recognition, emotion recognition, text recognition, object recognition, and structured content recognition.
- **Visual Comprehension Ability** Image understanding involves the ability of a model to understand the meaning of an image and the associated tasks. This area includes several sub-tasks, such as style appreciation, abstract image understanding, meme understanding, image analysis, chart analysis, general problem-solving, and reasoning QA.
- **Visual Storytelling Ability** Visual storytelling is the process of literary creation based on visual content, such as writing emails, poetry, stories, ads/product recommendations, and brainstorming.
- **Multi-Image Analysis Ability** Multi-image analysis is the task of analyzing and comparing multiple images. This area includes tasks such as comparing two or more images, summarizing information from multiple images, comparing products, and step-by-step analysis of images.
<p align="center">
<img src="../assets/touchstone_datasets.jpg" width="600"/>
<p>
We comprehensively evaluate the model's ability from five aspects. As shown in the figure above, examples of the 27 subtasks are given. From perception to cognition to creativity, as the difficulty increases, the demands on the models grow accordingly. Currently, LVLM capabilities are in their early stages. Our dataset contains more than 800 questions across 27 categories.
## Methods
We apply a powerful LLM as a judge to enable automated evaluation. So that the judge can effectively understand the content of an image, we manually replace the actual image input with fine-grained textual annotations. By feeding these annotations and the corresponding questions to a powerful LLM such as GPT4, we obtain reference answers.
For the evaluation of LVLMs, we provide the actual images and questions as input and obtain their answers. Finally, GPT4 scores the answers generated by the LVLMs based on the fine-grained annotations and questions. The scoring instructions require the model to assess the usefulness, relevance, and accuracy of the answers, treating the annotations as the content of the images. To ensure fairness, each model's answer is compared against a consistent reference answer from GPT4. The model's average score over all questions is used as its final score.
To remove the influence of answer position, we perform a second scoring round with the answer positions swapped and then average the two scores. This approach aims to mitigate any bias introduced by answer placement.
<p align="center">
<img src="../assets/touchstone_eval.png" width="600"/>
<p>
### Evaluation
#### Evaluation in English-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| PandaGPT | 488.5 |
| MiniGPT4 | 531.7 |
| InstructBLIP | 552.4 |
| LLaMA-AdapterV2 | 590.1 |
| mPLUG-Owl | 605.4 |
| LLaVA | 602.7 |
| Qwen-VL-Chat | 645.2 |
#### Evaluation in Chinese-based Multimodal Dialogue
| Model | Score |
|---------------|-------|
| VisualGLM | 247.1 |
| Qwen-VL-Chat | 401.2 |