Commit 4bd96acc authored by lvzhen

Update tools_using_demo/cli_demo_tool.py, tools_using_demo/openai_api_demo.py, tools_using_demo/README.md, tools_using_demo/README_en.md, tools_using_demo/tool_register.py, tensorrt_llm_demo/README.md, tensorrt_llm_demo/tensorrt_llm_cli_demo.py, resources/cli-demo.png, resources/web-demo2.png, resources/tool_en.png, resources/tool.png, resources/heart.png, resources/wechat.jpg, resources/web-demo.gif, resources/web-demo2.gif, resources/WECHAT.md, resources/code_en.gif, openai_api_demo/api_server.py, openai_api_demo/.env, openai_api_demo/openai_api_request.py, openai_api_demo/docker-compose.yml, openai_api_demo/utils.py, openai_api_demo/zhipu_api_request.py, openai_api_demo/langchain_openai_api.py, langchain_demo/ChatGLM3.py, langchain_demo/main.py, langchain_demo/tools/Calculator.py, langchain_demo/tools/DistanceConversion.py, langchain_demo/tools/Weather.py, Intel_device_demo/README.md, Intel_device_demo/ipex_llm_cpu_demo/api_server.py, Intel_device_demo/ipex_llm_cpu_demo/chatglm3_infer.py, Intel_device_demo/ipex_llm_cpu_demo/chatglm3_web_demo.py, Intel_device_demo/ipex_llm_cpu_demo/openai_api_request.py, Intel_device_demo/ipex_llm_cpu_demo/generate.py, Intel_device_demo/ipex_llm_cpu_demo/utils.py, Intel_device_demo/openvino_demo/openvino_cli_demo.py, Intel_device_demo/openvino_demo/README.md, finetune_demo/lora_finetune.ipynb, finetune_demo/finetune_hf.py, finetune_demo/inference_hf.py, finetune_demo/README.md, finetune_demo/README_en.md, finetune_demo/requirements.txt, finetune_demo/configs/ds_zero_3.json, finetune_demo/configs/ds_zero_2.json, finetune_demo/configs/ptuning_v2.yaml, finetune_demo/configs/lora.yaml, finetune_demo/configs/sft.yaml, composite_demo/assets/emojis.png, composite_demo/assets/demo.png, composite_demo/assets/heart.png, composite_demo/assets/tool.png, composite_demo/.streamlit/config.toml, composite_demo/client.py, composite_demo/conversation.py, composite_demo/README_en.md, composite_demo/main.py, composite_demo/demo_chat.py, composite_demo/README.md, composite_demo/requirements.txt, composite_demo/demo_tool.py, composite_demo/tool_registry.py, composite_demo/demo_ci.py, basic_demo/cli_demo_bad_word_ids.py, basic_demo/cli_demo.py, basic_demo/cli_batch_request_demo.py, basic_demo/web_demo_gradio.py, basic_demo/web_demo_streamlit.py, .github/ISSUE_TEMPLATE/bug_report.yaml, .github/ISSUE_TEMPLATE/feature-request.yaml, .github/PULL_REQUEST_TEMPLATE/pr_template.md, MODEL_LICENSE, .gitignore, DEPLOYMENT.md, DEPLOYMENT_en.md, LICENSE, PROMPT.md, README_en.md, requirements.txt, README.md, PROMPT_en.md, update_requirements.sh files
parent d0572507
"""
This script is designed for interacting with a local GLM3 AI model using the `ChatGLM3` class
from the `langchain_community` library. It facilitates continuous dialogue with the GLM3 model.
1. Start the Local Model Service: Before running this script, you need to execute the `api_server.py` script
to start the GLM3 model's service.
2. Run the Script: The script includes functionality for initializing the LLMChain object and obtaining AI responses,
allowing the user to input questions and receive AI answers.
3. This demo does not support streaming output.
"""
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
from langchain_community.llms.chatglm3 import ChatGLM3
def initialize_llm_chain(messages: list):
template = "{input}"
prompt = PromptTemplate.from_template(template)
endpoint_url = "http://127.0.0.1:8000/v1/chat/completions"
llm = ChatGLM3(
endpoint_url=endpoint_url,
max_tokens=4096,
prefix_messages=messages,
top_p=0.9
)
return LLMChain(prompt=prompt, llm=llm)
def get_ai_response(llm_chain, user_message):
ai_response = llm_chain.invoke({"input": user_message})
return ai_response
def continuous_conversation():
messages = [
SystemMessage(content="You are an intelligent AI assistant, named ChatGLM3."),
]
while True:
user_input = input("Human (or 'exit' to quit): ")
if user_input.lower() == 'exit':
break
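        # Rebuild the chain each turn so the accumulated history is passed to the model via prefix_messages.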
llm_chain = initialize_llm_chain(messages=messages)
ai_response = get_ai_response(llm_chain, user_input)
print("ChatGLM3: ", ai_response["text"])
messages += [
HumanMessage(content=user_input),
AIMessage(content=ai_response["text"]),
]
if __name__ == "__main__":
continuous_conversation()
"""
This script is an example of using the OpenAI API to create various interactions with a ChatGLM3 model.
It includes functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities, showcasing how to make requests
and handle responses.
"""
from openai import OpenAI
base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
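# The key is only a placeholder: the local API server typically does not validate it, but the OpenAI client requires a non-empty string.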
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=True):
messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's "
"instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你用生动的话语给我讲一个小故事吧"
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
presence_penalty=1.1,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["你好,给我讲一个故事,大概100字"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
import gc
import json
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers.generation.logits_process import LogitsProcessor
from typing import Union, Tuple
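# Replaces NaN/Inf logits with a fixed distribution (forcing token id 5) so that
# sampling cannot fail when the model produces invalid scores.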
class InvalidScoreLogitsProcessor(LogitsProcessor):
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor
) -> torch.FloatTensor:
if torch.isnan(scores).any() or torch.isinf(scores).any():
scores.zero_()
scores[..., 5] = 5e4
return scores
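# Parse raw ChatGLM3 output into plain text, or into a dict describing the requested
# tool call when tools are in use.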
def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
content = ""
for response in output.split("<|assistant|>"):
metadata, content = response.split("\n", maxsplit=1)
if not metadata.strip():
content = content.strip()
content = content.replace("[[训练时间]]", "2023年")
else:
if use_tool:
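                # The tool branch strips the surrounding code-fence lines and evaluates the
                # remaining tool_call(...) expression with the local tool_call helper to
                # recover the call's keyword arguments as a dict.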
content = "\n".join(content.split("\n")[1:-1])
def tool_call(**kwargs):
return kwargs
parameters = eval(content)
content = {
"name": metadata.strip(),
"arguments": json.dumps(parameters, ensure_ascii=False)
}
else:
content = {
"name": metadata.strip(),
"content": content
}
return content
@torch.inference_mode()
def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
messages = params["messages"]
tools = params["tools"]
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_tokens", 256))
echo = params.get("echo", True)
messages = process_chatglm_messages(messages, tools=tools)
query, role = messages[-1]["content"], messages[-1]["role"]
inputs = tokenizer.build_chat_input(query, history=messages[:-1], role=role)
inputs = inputs.to(model.device)
input_echo_len = len(inputs["input_ids"][0])
if input_echo_len >= model.config.seq_length:
print(f"Input length larger than {model.config.seq_length}")
eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command("<|user|>"),
tokenizer.get_command("<|observation|>")
]
gen_kwargs = {
"max_new_tokens": max_new_tokens,
"do_sample": True if temperature > 1e-5 else False,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"logits_processor": [InvalidScoreLogitsProcessor()],
}
if temperature > 1e-5:
gen_kwargs["temperature"] = temperature
total_len = 0
for total_ids in model.stream_generate(**inputs, eos_token_id=eos_token_id, **gen_kwargs):
total_ids = total_ids.tolist()[0]
total_len = len(total_ids)
if echo:
output_ids = total_ids[:-1]
else:
output_ids = total_ids[input_echo_len:-1]
response = tokenizer.decode(output_ids)
if response and response[-1] != "�":
response, stop_found = apply_stopping_strings(response, ["<|observation|>"])
yield {
"text": response,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
"finish_reason": "function_call" if stop_found else None,
}
if stop_found:
break
    # Only the last streamed result carries a finish_reason; set it to "stop" here.
ret = {
"text": response,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
"finish_reason": "stop",
}
yield ret
gc.collect()
torch.cuda.empty_cache()
def process_chatglm_messages(messages, tools=None):
_messages = messages
messages = []
msg_has_sys = False
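    # When tools are supplied, prepend the ChatGLM3 tool-calling system prompt that carries
    # the tool schemas; the first user-provided system message is then skipped below.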
if tools:
messages.append(
{
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
}
)
msg_has_sys = True
for m in _messages:
role, content, func_call = m.role, m.content, m.function_call
if role == "function":
messages.append(
{
"role": "observation",
"content": content
}
)
elif role == "assistant" and func_call is not None:
for response in content.split("<|assistant|>"):
metadata, sub_content = response.split("\n", maxsplit=1)
messages.append(
{
"role": role,
"metadata": metadata,
"content": sub_content.strip()
}
)
else:
if role == "system" and msg_has_sys:
msg_has_sys = False
continue
messages.append({"role": role, "content": content})
return messages
def generate_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
for response in generate_stream_chatglm3(model, tokenizer, params):
pass
return response
def apply_stopping_strings(reply, stop_strings) -> Tuple[str, bool]:
stop_found = False
for string in stop_strings:
idx = reply.find(string)
if idx != -1:
reply = reply[:idx]
stop_found = True
break
if not stop_found:
        # If something like "\nYo" is generated just before "\nYou:" is completed, trim it
for string in stop_strings:
for j in range(len(string) - 1, 0, -1):
if reply[-j:] == string[:j]:
reply = reply[:-j]
break
else:
continue
break
return reply, stop_found
"""
This script is an example of using the Zhipu API to create various interactions with a ChatGLM3 model. It includes
functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to introduce the ChatGLM3-6B model.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities,
showcasing how to make requests and handle responses.
Note: The ZhipuAI client only checks that the API key matches the xxx.xxx format;
since requests go to a local server, a real key is not required.
"""
from zhipuai import ZhipuAI
base_url = "http://127.0.0.1:8000/v1/"
client = ZhipuAI(api_key="EMP.TY", base_url=base_url)
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="chatglm3_6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=True):
messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow "
"the user's instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你介绍一下chatglm3-6b这个模型"
}
]
response = client.chat.completions.create(
model="chatglm3_",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["ChatGLM3-6B 是一个大型的中英双语模型。"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
# basic requirements
protobuf>=4.25.3
transformers>=4.39.3
tokenizers>=0.15.0
cpm_kernels>=1.0.11
torch>=2.1.0
gradio>=4.26.0
sentencepiece>=0.2.0
sentence_transformers>=2.4.0
accelerate>=0.29.2
streamlit>=1.33.0
fastapi>=0.110.0
loguru~=0.7.2
mdtex2html>=1.3.0
latex2mathml>=3.77.0
jupyter_client>=8.6.1
nltk
# for openai demo
#openai>=1.17.1
#zhipuai>=2.0.1
#pydantic>=2.7.0
#sse-starlette>=2.0.0
#uvicorn>=0.29.0
#timm>=0.9.16
#tiktoken>=0.6.0
# for langchain demo
#langchain>=0.1.16
#langchainhub>=0.1.15
#arxiv>=2.1.0
<div align="center">
<img src=wechat.jpg width="60%"/>
<p> 扫码关注公众号,加入「ChatGLM交流群」 </p>
<p> Scan the QR code to follow the official account and join the "ChatGLM Discussion Group" </p>
</div>
# Deploying ChatGLM3 with NVIDIA TensorRT-LLM
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA. Follow the steps below to deploy the ChatGLM3 models with TensorRT-LLM.
## 1. Install TensorRT-LLM
#### Get the TensorRT-LLM code:
```bash
# The TensorRT-LLM code must be pulled with git-lfs
apt-get update && apt-get -y install git git-lfs
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
# This walkthrough uses the v0.7.0 release
git checkout tags/v0.7.0 -b release/0.7.0
git submodule update --init --recursive
git lfs install
git lfs pull
```
#### Build the Docker image and install TensorRT-LLM:
```bash
make -C docker release_build
```
#### Run the Docker image:
```bash
make -C docker release_run
```
## 2. Build the TensorRT-LLM inference engine for ChatGLM3
#### Install the Python dependencies:
```bash
cd ./examples/chatglm
pip install -r requirements.txt
apt-get update
apt-get install git-lfs
```
#### Download the ChatGLM3 model from Hugging Face:
```
# Download only the model(s) you want to deploy
git clone https://huggingface.co/THUDM/chatglm3-6b chatglm3_6b
git clone https://huggingface.co/THUDM/chatglm3-6b-base chatglm3_6b_base
git clone https://huggingface.co/THUDM/chatglm3-6b-32k chatglm3_6b_32k
```
#### Build the inference engine with build.py:
Some examples of building an engine with build.py:
```bash
# Build a default engine with fp16 precision
python3 build.py -m chatglm3_6b --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a default fp16 engine with FMHA enabled (see below)
python3 build.py -m chatglm3_6b --enable_context_fmha --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a weight-only (W8A16) engine
python3 build.py -m chatglm3_6b --use_weight_only --output_dir trt_engines/chatglm3_6b/weight_only/1-gpu
# Build a default fp16 engine that runs across two GPUs
python3 build.py -m chatglm3_6b --world_size 2 --output_dir trt_engines/chatglm3_6b/fp16/2-gpu
# Use the chatglm3_6b_base model
python3 build.py -m chatglm3_6b_base --output_dir trt_engines/chatglm3_6b_base/fp16/1-gpu
# Use the chatglm3_6b_32k model
python3 build.py -m chatglm3_6b_32k --output_dir trt_engines/chatglm3_6b-32k/fp16/1-gpu
```
#### Configurable plugin options
* Use `--use_gpt_attention_plugin <DataType>` to configure the GPT Attention plugin (float16 by default).
* Use `--use_gemm_plugin <DataType>` to configure the GEMM plugin (float16 by default).
* Use `--use_rmsnorm_plugin <DataType>` to configure the RMS normalization plugin (float16 by default).
#### Fused Multi-Head Attention (FMHA)
* Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable the FMHA kernels, which gives better performance while reducing GPU memory usage.
* FMHA cannot be used if `--use_gpt_attention_plugin` is disabled.
* `--enable_context_fmha` uses an FP16 accumulator, which may slightly reduce accuracy. You can use `--enable_context_fmha_fp32_acc` instead to preserve accuracy, at the cost of part of the FMHA speedup.
#### Weight-Only quantization
* Use `--use_weight_only` to enable weight-only quantization, which speeds up inference and reduces GPU memory usage.
* Choose between int8 and int4 quantization with `--weight_only_precision int8` or `--weight_only_precision int4`; the default is int8.
#### In-flight Batching (requires serving with NVIDIA Triton)
* Use `--use_inflight_batching` to enable in-flight batching; the paged KV cache is enabled automatically along with it.
* The number of blocks in the paged KV cache can be configured with `--tokens_per_block`.
For more features and configuration options, see the [TensorRT-LLM ChatGLM implementation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/chatglm).
## 3. Run inference with the TensorRT-LLM Python runtime
#### Single-node, single-GPU example:
```bash
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Single-node, multi-GPU example:
```bash
mpirun -n 2 \
python ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
* If you run `mpirun` as root, you may need to add the `--allow-run-as-root` flag.
#### Run summarize.py for an article-summarization task:
```bash
python3 ../summarize.py --test_trt_llm \
--hf_model_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Run the chat demo we provide, [tensorrt_llm_cli_demo.py](tensorrt_llm_cli_demo.py):
```bash
python3 tensorrt_llm_cli_demo.py --tokenizer_dir chatglm3_6b --engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
Sample output:
```
用户: what is your name?
ChatGLM3-6B:Hello, I am an assistant named ChatGLM3-6B, and you can call me assistant. What can I help you with??
用户: what is new in ChatGLM3-6B compared with ChatGLM2-6B?
ChatGLM3-6B:ChatGLM3-6B is an improved version of ChatGLM2-6B. Compared with ChatGLM2-6B, ChatGLM3-6B has the following improvements:
1. Enhanced language understanding capabilities: ChatGLM3-6B's language model is based on the GLM3-6B model, which has been pre-trained on more diverse and large-scale data, resulting in better language understanding and generation capabilities.
2. Improved generation ability: ChatGLM3-6B has improved the generation ability compared to ChatGLM2-6B. With more training data and optimization algorithms, ChatGLM3-6B can generate more coherent and natural-looking text.
3. Enhanced adaptability to different dialogue scenarios: ChatGLM3-6B has been trained on more diverse dialogue data, including dialogue scenarios with different languages, cultures, and styles, making it more adaptable to different dialogue scenarios.
4. New features and functions: ChatGLM3-6B also has some new features and functions, such as support for multiple choice questions, sentiment analysis, and entity recognition.
In short, ChatGLM3-6B is more advanced and capable than ChatGLM2-6B, and can better meet the needs of users in various scenarios..
```
#### Benchmarking:
See [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/benchmarks/python) for how to benchmark ChatGLM3 running on TensorRT-LLM.
## 4. Deploy an online inference server with NVIDIA Triton
With NVIDIA Triton you can deploy a high-performance, scalable, and reliable inference service, and enable In-flight Batching to improve serving throughput. See the [In-flight Batching Triton Backend](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main/inflight_batcher_llm) for details.
"""
This script is a part of a larger project for generating text using large language models.
It includes functionalities for finding engine files, parsing arguments, setting up configurations for different models,
and executing the generation process with various settings.
This script particularly supports models like ChatGLM3-6B and its variants,
handling quantization, serialization, and runtime aspects.
Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Modifications made by Yuxuan.Zhang @ ZhipuAI on 2023-12-24.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Modifications:
1. Removed input_file, tokenizer_type, and other parameters unrelated to dialogue. Set num_beams to 1.
2. Adapted single turn dialogue into ChatGLM3-6B template and implemented multi-turn conversations.
"""
import argparse
import json
import torch
import transformers
from pathlib import Path
from typing import List
import tensorrt_llm
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (GenerationSession, ModelConfig, SamplingConfig)
def find_engines(dir: Path, model_name: str = "*", dtype: str = "*", tp_size: str = "*", rank: str = "*") -> List[Path]:
"""
Searches for engine files matching a specified pattern within a directory.
This is typically used to locate compiled model files for efficient execution on specific hardware.
Parameters:
- dir: The directory to search.
- model_name, dtype, tp_size, rank:
Pattern matching parameters to filter engine files by model name, data type,
tensor parallel size, and rank respectively.
Returns:
- A list of Paths pointing to the engine files.
"""
template = f"{model_name}_{dtype}_tp{tp_size}_rank{rank}.engine"
return list(dir.glob(template))
def parse_arguments(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--model_name',
type=str,
choices=[
"chatglm3_6b",
"chatglm3_6b_base",
"chatglm3_6b_32k"
],
default="chatglm3_6b",
help='the name of the model')
parser.add_argument('--max_output_len', type=int, default=4096)
parser.add_argument('--engine_dir', type=str, default=None)
parser.add_argument('--tokenizer_dir', type=str, default=None)
parser.add_argument('--temperature', type=float, default=0.95)
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0.8)
parser.add_argument('--random_seed', type=int, default=2023)
parser.add_argument('--streaming', default=True, action='store_true')
args = parser.parse_args(args)
return args
def main():
"""
The main execution function of the script. It orchestrates the text generation process
by performing several key steps:
- Parses command-line arguments to configure model details, output specifications,
and other user-defined parameters.
- Loads the model configuration from a specified directory and prepares the environment for text generation
based on the model and hardware specifics.
- Sets up the generation session with the appropriate model, tokenizer, and runtime configurations.
- Enters a loop to continuously accept user input, generate text based on the provided prompts, and output
the model's responses.
- Handles special commands such as 'stop' to end the conversation and 'clear' to reset the chat history.
- Manages resources and ensures that the generated text is properly formatted and presented to the user.
The function is designed to be the entry point of the script, invoking all necessary components and managing the
flow of data and control throughout the execution.
"""
args = parse_arguments()
config_path = Path(args.engine_dir) / 'config.json'
with open(config_path, 'r') as f:
config = json.load(f)
dtype = config['builder_config']['precision']
max_output_len = min(config['builder_config']['max_output_len'], args.max_output_len)
use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin']
remove_input_padding = config['builder_config']['remove_input_padding']
tp_size = config['builder_config']['tensor_parallel']
pp_size = config['builder_config']['pipeline_parallel']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
max_output_len = min(max_output_len, args.max_output_len)
runtime_rank = tensorrt_llm.mpi_rank()
runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=world_size)
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
serialize_path = find_engines(
dir=Path(args.engine_dir),
model_name=args.model_name,
dtype=dtype,
tp_size=world_size,
rank=runtime_rank)[0]
tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_dir, trust_remote_code=True)
model_config = ModelConfig(vocab_size=config['builder_config']['vocab_size'],
num_layers=config['builder_config']['num_layers'],
num_heads=config['builder_config']['num_heads'] // tp_size,
num_kv_heads=(config['builder_config']['num_kv_heads'] + tp_size - 1) // tp_size,
hidden_size=config['builder_config']['hidden_size'] // tp_size,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=config['builder_config']['remove_input_padding'],
model_name=args.model_name,
paged_kv_cache=config['builder_config']['paged_kv_cache'],
quant_mode=QuantMode(config['builder_config']['quant_mode']),
dtype=dtype)
sampling_config = SamplingConfig(
end_id=tokenizer.eos_token_id,
pad_id=tokenizer.pad_token_id,
num_beams=1,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p
)
sampling_config.random_seed = args.random_seed
with open(serialize_path, 'rb') as f:
engine_buffer = f.read()
    decoder = GenerationSession(model_config, engine_buffer, runtime_mapping)
history = []
while True:
input_text_with_history = ""
max_input_len = config['builder_config']['max_input_len']
input_text = input("用户: ")
if input_text.lower() == 'stop':
break
if input_text.lower() == 'clear':
history = []
print("ChatGLM3-6B: 对话历史已清空")
continue
history.append(input_text)
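        # Rebuild the prompt from the history: even indices are user turns wrapped in
        # <|user|>/<|assistant|> tags, odd indices are previous model replies appended as-is.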
for idx, content in enumerate(history):
if idx % 2 != 0:
input_text_with_history += "{}\n".format(content)
else:
input_text_with_history += "<|user|>{}\n<|assistant|>".format(content)
tokenized = tokenizer(
input_text_with_history,
return_tensors="pt",
padding=True,
return_length=True
)
input_ids = tokenized['input_ids'].int()
input_lengths = tokenized['length'].int()
max_input_len_real = torch.max(input_lengths)
if max_input_len_real > max_input_len:
input_ids = input_ids[:, :max_input_len]
input_lengths = torch.where(input_lengths > max_input_len, max_input_len, input_lengths)
else:
max_input_len = max_input_len_real
if remove_input_padding:
input_ids_no_padding = (torch.zeros(1, torch.sum(input_lengths), dtype=torch.int32))
lengths_acc = torch.cumsum(torch.cat([torch.IntTensor([0]), input_lengths]), dim=0)
for i in range(len(input_ids)):
input_ids_no_padding[0, lengths_acc[i]:lengths_acc[i + 1]] = torch.IntTensor(
input_ids[i, max_input_len - input_lengths[i]:max_input_len])
input_ids = input_ids_no_padding
elif use_gpt_attention_plugin:
input_ids_padding_right = torch.zeros_like(input_ids) + sampling_config.end_id
for i, sample in enumerate(input_ids):
nPadding = 0
for token in sample:
if token == sampling_config.pad_id:
nPadding += 1
else:
break
input_ids_padding_right[i, :len(sample[nPadding:])] = sample[nPadding:]
input_ids = input_ids_padding_right
input_lengths = torch.tensor([input_ids.shape[-1]], dtype=torch.int32)
decoder.setup(1, max_input_len, max_output_len, 1)
output = decoder.decode(
input_ids.contiguous().cuda(),
input_lengths.contiguous().cuda(),
sampling_config,
output_sequence_lengths=True,
return_dict=True,
streaming=args.streaming
)
print("ChatGLM3-6B:", end="")
generated_text = ""
if args.streaming:
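            # Each streamed item carries the full sequence generated so far; decode and
            # print only the newest token (at position sequence_length - 1).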
for output_item in output:
output_id = output_item["output_ids"]
output_sequence_lengths = output_item["sequence_lengths"]
output_id = output_id[0, 0, output_sequence_lengths[0, 0] - 1]
output_word = tokenizer.convert_ids_to_tokens(int(output_id))
output_word = output_word.replace("▁", " ")
output_word = tokenizer.convert_tokens_to_string(output_word)
print(output_word, end="", flush=True)
generated_text += output_word
print("\n")
else:
torch.cuda.synchronize()
output_ids = output["output_ids"][0]
output = output_ids[0, input_lengths.item():]
generated_text = tokenizer.decode(output, skip_special_tokens=True)
print(generated_text)
history.append(generated_text)
del decoder
print(f"Good bye!")
if __name__ == '__main__':
main()
# 工具调用
本文档将介绍如何使用 ChatGLM3-6B 进行工具调用。目前只有 ChatGLM3-6B 模型支持工具调用,而 ChatGLM3-6B-Base 和 ChatGLM3-6B-32K 模型不支持。
## 构建 System Prompt
这里以两个工具调用为例,首先准备好要构建的数据的描述信息。
```python
tools = [
{
"name": "track",
"description": "追踪指定股票的实时价格",
"parameters": {
"type": "object",
"properties": {
"symbol": {
"description": "需要追踪的股票代码"
}
},
"required": ['symbol']
}
},
{
"name": "text-to-speech",
"description": "将文本转换为语音",
"parameters": {
"type": "object",
"properties": {
"text": {
"description": "需要转换成语音的文本"
},
"voice": {
"description": "要使用的语音类型(男声、女声等)"
},
"speed": {
"description": "语音的速度(快、中等、慢等)"
}
},
"required": ['text']
}
}
]
system_info = {"role": "system", "content": "Answer the following questions as best as you can. You have access to the following tools:", "tools": tools}
```
请确保工具的定义格式与例子中一致以获得最优的性能
## 提出问题
注意:目前 ChatGLM3-6B 的工具调用只支持通过 `chat` 方法,不支持 `stream_chat` 方法。
```python
history = [system_info]
query = "帮我查询股票10111的价格"
response, history = model.chat(tokenizer, query, history=history)
print(response)
```
这里期望得到的输出为
```json
{"name": "track", "parameters": {"symbol": "10111"}}
```
这表示模型需要调用工具 `track`,并且需要传入参数 `symbol`
## 调用工具,生成回复
这里需要自行实现调用工具的逻辑。假设已经得到了返回结果,将结果以 json 格式返回给模型并得到回复。
```python
result = json.dumps({"price": 12412}, ensure_ascii=False)
response, history = model.chat(tokenizer, result, history=history, role="observation")
print(response)
```
这里 `role="observation"` 表示输入的是工具调用的返回值而不是用户输入,不能省略。
期望得到的输出为
```
根据您的查询,经过API的调用,股票10111的价格是12412。
```
这表示本次工具调用已经结束,模型根据返回结果生成回复。对于比较复杂的问题,模型可能需要进行多次工具调用。这时,可以根据返回的 `response` 是 `str` 还是 `dict` 来判断返回的是生成的回复还是工具调用请求。
# Tool Invocation
This document introduces how to use ChatGLM3-6B for tool invocation. Currently, only the ChatGLM3-6B model supports tool invocation; the ChatGLM3-6B-Base and ChatGLM3-6B-32K models do not.
## Building System Prompt
Here we take two tools as an example. First, prepare the descriptions of the tools to be registered.
```python
tools = [
{
"name": "track",
"description": "Track the real-time price of a specified stock",
"parameters": {
"type": "object",
"properties": {
"symbol": {
"description": "The stock code that needs to be tracked"
}
},
"required": ['symbol']
}
},
{
"name": "text-to-speech",
"description": "Convert text to speech",
"parameters": {
"type": "object",
"properties": {
"text": {
"description": "The text that needs to be converted into speech"
},
"voice": {
"description": "The type of voice to use (male, female, etc.)"
},
"speed": {
"description": "The speed of the speech (fast, medium, slow, etc.)"
}
},
"required": ['text']
}
}
]
system_info = {"role": "system", "content": "Answer the following questions as best as you can. You have access to the following tools:", "tools": tools}
```
Please ensure that the definition format of the tool is consistent with the example to obtain optimal performance.
## Asking Questions
Note: Currently, the tool invocation of ChatGLM3-6B only supports the `chat` method and does not support the `stream_chat` method.
```python
history = [system_info]
query = "Help me inquire the price of stock 10111"
response, history = model.chat(tokenizer, query, history=history)
print(response)
```
The expected output here is
```json
{"name": "track", "parameters": {"symbol": "10111"}}
```
This indicates that the model needs to call the tool `track`, and the parameter `symbol` needs to be passed in.
## Invoke Tool, Generate Response
Here, you need to implement the tool-calling logic yourself. Assuming the tool's result has been obtained, return it to the model in JSON format to get a response.
```python
result = json.dumps({"price": 12412}, ensure_ascii=False)
response, history = model.chat(tokenizer, result, history=history, role="observation")
print(response)
```
Here `role="observation"` indicates that the input is the return value of the tool invocation rather than user input, and it cannot be omitted.
The expected output is
```
Based on your query, after the API call, the price of stock 10111 is 12412.
```
This indicates that this tool invocation has finished and the model has generated a reply from the returned result. For more complex questions, the model may need to make several tool invocations in a row. In that case, check whether the returned `response` is a `str` or a `dict` to tell a final reply from another tool-invocation request, as sketched below.
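For multi-turn tool use, one simple pattern is to keep calling `chat` until the model returns plain text. The sketch below reuses `model`, `tokenizer`, and a `history` initialised with `system_info` from the examples above, and assumes a hypothetical `run_tool(name, parameters)` helper that executes the requested tool and returns a JSON-serialisable result:
```python
import json

def chat_until_text(query, history):
    # First turn: the user's question.
    response, history = model.chat(tokenizer, query, history=history)
    # A dict response is a tool-invocation request; a str response is the final answer.
    while isinstance(response, dict):
        result = run_tool(response["name"], response.get("parameters", {}))
        # Feed the tool output back as an observation so the model can continue.
        response, history = model.chat(tokenizer, json.dumps(result, ensure_ascii=False),
                                       history=history, role="observation")
    return response, history
```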
"""
This demo script is designed for interacting with ChatGLM3-6B to demonstrate its function-call (tool-calling) capabilities.
"""
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModel
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto").eval()
os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
stop_stream = False
def build_prompt(history):
prompt = "欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"
for query, response in history:
prompt += f"\n\n用户:{query}"
prompt += f"\n\nChatGLM3-6B:{response}"
return prompt
tools = [
{'name': 'track', 'description': '追踪指定股票的实时价格',
'parameters':
{
'type': 'object', 'properties':
{'symbol':
{
'description': '需要追踪的股票代码'
}
},
'required': []
}
}, {
'name': '/text-to-speech', 'description': '将文本转换为语音',
'parameters':
{
'type': 'object', 'properties':
{
'text':
{
'description': '需要转换成语音的文本'
},
'voice':
{
'description': '要使用的语音类型(男声、女声等)'
},
'speed': {
'description': '语音的速度(快、中等、慢等)'
}
}, 'required': []
}
},
{
'name': '/image_resizer', 'description': '调整图片的大小和尺寸',
'parameters': {'type': 'object',
'properties':
{
'image_file':
{
'description': '需要调整大小的图片文件'
},
'width':
{
'description': '需要调整的宽度值'
},
'height':
{
'description': '需要调整的高度值'
}
},
'required': []
}
},
{
'name': '/foodimg', 'description': '通过给定的食品名称生成该食品的图片',
'parameters': {
'type': 'object', 'properties':
{
'food_name':
{
'description': '需要生成图片的食品名称'
}
},
'required': []
}
}
]
system_item = {
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
}
def main():
past_key_values, history = None, [system_item]
role = "user"
global stop_stream
print("欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
while True:
query = input("\n用户:") if role == "user" else input("\n结果:")
if query.strip() == "stop":
break
if query.strip() == "clear":
past_key_values, history = None, [system_item]
role = "user"
os.system(clear_command)
print("欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("\nChatGLM:", end="")
response, history = model.chat(tokenizer, query, history=history, role=role)
print(response, end="", flush=True)
print("")
if isinstance(response, dict):
role = "observation"
else:
role = "user"
if __name__ == "__main__":
main()