Commit 4bd96acc authored by lvzhen

Update tools_using_demo/cli_demo_tool.py, tools_using_demo/openai_api_demo.py, tools_using_demo/README.md, tools_using_demo/README_en.md, tools_using_demo/tool_register.py, tensorrt_llm_demo/README.md, tensorrt_llm_demo/tensorrt_llm_cli_demo.py, resources/cli-demo.png, resources/web-demo2.png, resources/tool_en.png, resources/tool.png, resources/heart.png, resources/wechat.jpg, resources/web-demo.gif, resources/web-demo2.gif, resources/WECHAT.md, resources/code_en.gif, openai_api_demo/api_server.py, openai_api_demo/.env, openai_api_demo/openai_api_request.py, openai_api_demo/docker-compose.yml, openai_api_demo/utils.py, openai_api_demo/zhipu_api_request.py, openai_api_demo/langchain_openai_api.py, langchain_demo/ChatGLM3.py, langchain_demo/main.py, langchain_demo/tools/Calculator.py, langchain_demo/tools/DistanceConversion.py, langchain_demo/tools/Weather.py, Intel_device_demo/README.md, Intel_device_demo/ipex_llm_cpu_demo/api_server.py, Intel_device_demo/ipex_llm_cpu_demo/chatglm3_infer.py, Intel_device_demo/ipex_llm_cpu_demo/chatglm3_web_demo.py, Intel_device_demo/ipex_llm_cpu_demo/openai_api_request.py, Intel_device_demo/ipex_llm_cpu_demo/generate.py, Intel_device_demo/ipex_llm_cpu_demo/utils.py, Intel_device_demo/openvino_demo/openvino_cli_demo.py, Intel_device_demo/openvino_demo/README.md, finetune_demo/lora_finetune.ipynb, finetune_demo/finetune_hf.py, finetune_demo/inference_hf.py, finetune_demo/README.md, finetune_demo/README_en.md, finetune_demo/requirements.txt, finetune_demo/configs/ds_zero_3.json, finetune_demo/configs/ds_zero_2.json, finetune_demo/configs/ptuning_v2.yaml, finetune_demo/configs/lora.yaml, finetune_demo/configs/sft.yaml, composite_demo/assets/emojis.png, composite_demo/assets/demo.png, composite_demo/assets/heart.png, composite_demo/assets/tool.png, composite_demo/.streamlit/config.toml, composite_demo/client.py, composite_demo/conversation.py, composite_demo/README_en.md, composite_demo/main.py, composite_demo/demo_chat.py, composite_demo/README.md, composite_demo/requirements.txt, composite_demo/demo_tool.py, composite_demo/tool_registry.py, composite_demo/demo_ci.py, basic_demo/cli_demo_bad_word_ids.py, basic_demo/cli_demo.py, basic_demo/cli_batch_request_demo.py, basic_demo/web_demo_gradio.py, basic_demo/web_demo_streamlit.py, .github/ISSUE_TEMPLATE/bug_report.yaml, .github/ISSUE_TEMPLATE/feature-request.yaml, .github/PULL_REQUEST_TEMPLATE/pr_template.md, MODEL_LICENSE, .gitignore, DEPLOYMENT.md, DEPLOYMENT_en.md, LICENSE, PROMPT.md, README_en.md, requirements.txt, README.md, PROMPT_en.md, update_requirements.sh files
parent d0572507
"""
This script is designed for interacting with a local GLM3 AI model using the `ChatGLM3` class
from the `langchain_community` library. It facilitates continuous dialogue with the GLM3 model.
1. Start the Local Model Service: Before running this script, you need to execute the `api_server.py` script
to start the GLM3 model's service.
2. Run the Script: The script includes functionality for initializing the LLMChain object and obtaining AI responses,
allowing the user to input questions and receive AI answers.
3. This demo does not support streaming output.
"""
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
from langchain_community.llms.chatglm3 import ChatGLM3
def initialize_llm_chain(messages: list):
template = "{input}"
prompt = PromptTemplate.from_template(template)
endpoint_url = "http://127.0.0.1:8000/v1/chat/completions"
llm = ChatGLM3(
endpoint_url=endpoint_url,
max_tokens=4096,
prefix_messages=messages,
top_p=0.9
)
return LLMChain(prompt=prompt, llm=llm)
def get_ai_response(llm_chain, user_message):
ai_response = llm_chain.invoke({"input": user_message})
return ai_response
def continuous_conversation():
messages = [
SystemMessage(content="You are an intelligent AI assistant, named ChatGLM3."),
]
while True:
user_input = input("Human (or 'exit' to quit): ")
if user_input.lower() == 'exit':
break
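        # Rebuild the chain each turn so the accumulated history is passed to the model via prefix_messages.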
llm_chain = initialize_llm_chain(messages=messages)
ai_response = get_ai_response(llm_chain, user_input)
print("ChatGLM3: ", ai_response["text"])
messages += [
HumanMessage(content=user_input),
AIMessage(content=ai_response["text"]),
]
if __name__ == "__main__":
continuous_conversation()
"""
This script is an example of using the OpenAI API to create various interactions with a ChatGLM3 model.
It includes functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities, showcasing how to make requests
and handle responses.
"""
from openai import OpenAI
base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
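# The key is only a placeholder: the local API server typically does not validate it, but the OpenAI client requires a non-empty string.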
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=True):
messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's "
"instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你用生动的话语给我讲一个小故事吧"
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
presence_penalty=1.1,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["你好,给我讲一个故事,大概100字"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
import gc
import json
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers.generation.logits_process import LogitsProcessor
from typing import Union, Tuple
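# Replaces NaN/Inf logits with a fixed distribution (forcing token id 5) so that
# sampling cannot fail when the model produces invalid scores.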
class InvalidScoreLogitsProcessor(LogitsProcessor):
def __call__(
self, input_ids: torch.LongTensor, scores: torch.FloatTensor
) -> torch.FloatTensor:
if torch.isnan(scores).any() or torch.isinf(scores).any():
scores.zero_()
scores[..., 5] = 5e4
return scores
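# Parse raw ChatGLM3 output into plain text, or into a dict describing the requested
# tool call when tools are in use.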
def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
content = ""
for response in output.split("<|assistant|>"):
metadata, content = response.split("\n", maxsplit=1)
if not metadata.strip():
content = content.strip()
content = content.replace("[[训练时间]]", "2023年")
else:
if use_tool:
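                # The tool branch strips the surrounding code-fence lines and evaluates the
                # remaining tool_call(...) expression with the local tool_call helper to
                # recover the call's keyword arguments as a dict.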
content = "\n".join(content.split("\n")[1:-1])
def tool_call(**kwargs):
return kwargs
parameters = eval(content)
content = {
"name": metadata.strip(),
"arguments": json.dumps(parameters, ensure_ascii=False)
}
else:
content = {
"name": metadata.strip(),
"content": content
}
return content
@torch.inference_mode()
def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
messages = params["messages"]
tools = params["tools"]
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_tokens", 256))
echo = params.get("echo", True)
messages = process_chatglm_messages(messages, tools=tools)
query, role = messages[-1]["content"], messages[-1]["role"]
inputs = tokenizer.build_chat_input(query, history=messages[:-1], role=role)
inputs = inputs.to(model.device)
input_echo_len = len(inputs["input_ids"][0])
if input_echo_len >= model.config.seq_length:
print(f"Input length larger than {model.config.seq_length}")
eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command("<|user|>"),
tokenizer.get_command("<|observation|>")
]
gen_kwargs = {
"max_new_tokens": max_new_tokens,
"do_sample": True if temperature > 1e-5 else False,
"top_p": top_p,
"repetition_penalty": repetition_penalty,
"logits_processor": [InvalidScoreLogitsProcessor()],
}
if temperature > 1e-5:
gen_kwargs["temperature"] = temperature
total_len = 0
for total_ids in model.stream_generate(**inputs, eos_token_id=eos_token_id, **gen_kwargs):
total_ids = total_ids.tolist()[0]
total_len = len(total_ids)
if echo:
output_ids = total_ids[:-1]
else:
output_ids = total_ids[input_echo_len:-1]
response = tokenizer.decode(output_ids)
if response and response[-1] != "�":
response, stop_found = apply_stopping_strings(response, ["<|observation|>"])
yield {
"text": response,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
"finish_reason": "function_call" if stop_found else None,
}
if stop_found:
break
    # Only the last streamed result carries a finish_reason; set it to "stop" here.
ret = {
"text": response,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
"finish_reason": "stop",
}
yield ret
gc.collect()
torch.cuda.empty_cache()
def process_chatglm_messages(messages, tools=None):
_messages = messages
messages = []
msg_has_sys = False
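    # When tools are supplied, prepend the ChatGLM3 tool-calling system prompt that carries
    # the tool schemas; the first user-provided system message is then skipped below.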
if tools:
messages.append(
{
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
}
)
msg_has_sys = True
for m in _messages:
role, content, func_call = m.role, m.content, m.function_call
if role == "function":
messages.append(
{
"role": "observation",
"content": content
}
)
elif role == "assistant" and func_call is not None:
for response in content.split("<|assistant|>"):
metadata, sub_content = response.split("\n", maxsplit=1)
messages.append(
{
"role": role,
"metadata": metadata,
"content": sub_content.strip()
}
)
else:
if role == "system" and msg_has_sys:
msg_has_sys = False
continue
messages.append({"role": role, "content": content})
return messages
def generate_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
for response in generate_stream_chatglm3(model, tokenizer, params):
pass
return response
def apply_stopping_strings(reply, stop_strings) -> Tuple[str, bool]:
stop_found = False
for string in stop_strings:
idx = reply.find(string)
if idx != -1:
reply = reply[:idx]
stop_found = True
break
if not stop_found:
        # If something like "\nYo" is generated just before "\nYou:" is completed, trim it
for string in stop_strings:
for j in range(len(string) - 1, 0, -1):
if reply[-j:] == string[:j]:
reply = reply[:-j]
break
else:
continue
break
return reply, stop_found
"""
This script is an example of using the Zhipu API to create various interactions with a ChatGLM3 model. It includes
functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to introduce the ChatGLM3-6B model.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities,
showcasing how to make requests and handle responses.
Note: The ZhipuAI client only checks that the API key matches the xxx.xxx format;
since requests go to a local server, a real key is not required.
"""
from zhipuai import ZhipuAI
base_url = "http://127.0.0.1:8000/v1/"
client = ZhipuAI(api_key="EMP.TY", base_url=base_url)
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="chatglm3_6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=True):
messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow "
"the user's instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你介绍一下chatglm3-6b这个模型"
}
]
response = client.chat.completions.create(
model="chatglm3_",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["ChatGLM3-6B 是一个大型的中英双语模型。"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
# basic requirements
protobuf>=4.25.3
transformers>=4.39.3
tokenizers>=0.15.0
cpm_kernels>=1.0.11
torch>=2.1.0
gradio>=4.26.0
sentencepiece>=0.2.0
sentence_transformers>=2.4.0
accelerate>=0.29.2
streamlit>=1.33.0
fastapi>=0.110.0
loguru~=0.7.2
mdtex2html>=1.3.0
latex2mathml>=3.77.0
jupyter_client>=8.6.1
nltk
# for openai demo
#openai>=1.17.1
#zhipuai>=2.0.1
#pydantic>=2.7.0
#sse-starlette>=2.0.0
#uvicorn>=0.29.0
#timm>=0.9.16
#tiktoken>=0.6.0
# for langchain demo
#langchain>=0.1.16
#langchainhub>=0.1.15
#arxiv>=2.1.0
<div align="center">
<img src=wechat.jpg width="60%"/>
<p> 扫码关注公众号,加入「ChatGLM交流群」 </p>
<p> Scan the QR code to follow the official account and join the "ChatGLM Discussion Group" </p>
</div>
# Deploying ChatGLM3 with NVIDIA TensorRT-LLM
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA. Follow the steps below to deploy the ChatGLM3 models with TensorRT-LLM.
## 1. Install TensorRT-LLM
#### Get the TensorRT-LLM code:
```bash
# The TensorRT-LLM code must be pulled with git-lfs
apt-get update && apt-get -y install git git-lfs
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
# This walkthrough uses the v0.7.0 release
git checkout tags/v0.7.0 -b release/0.7.0
git submodule update --init --recursive
git lfs install
git lfs pull
```
#### Build the Docker image and install TensorRT-LLM:
```bash
make -C docker release_build
```
#### Run the Docker image:
```bash
make -C docker release_run
```
## 2. Build the TensorRT-LLM inference engine for ChatGLM3
#### Install the Python dependencies:
```bash
cd ./examples/chatglm
pip install -r requirements.txt
apt-get update
apt-get install git-lfs
```
#### Download the ChatGLM3 model from Hugging Face:
```
# Download only the model(s) you want to deploy
git clone https://huggingface.co/THUDM/chatglm3-6b chatglm3_6b
git clone https://huggingface.co/THUDM/chatglm3-6b-base chatglm3_6b_base
git clone https://huggingface.co/THUDM/chatglm3-6b-32k chatglm3_6b_32k
```
#### Build the inference engine with build.py:
Some examples of building an engine with build.py:
```bash
# Build a default engine with fp16 precision
python3 build.py -m chatglm3_6b --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a default fp16 engine with FMHA enabled (see below)
python3 build.py -m chatglm3_6b --enable_context_fmha --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a weight-only (W8A16) engine
python3 build.py -m chatglm3_6b --use_weight_only --output_dir trt_engines/chatglm3_6b/weight_only/1-gpu
# Build a default fp16 engine that runs across two GPUs
python3 build.py -m chatglm3_6b --world_size 2 --output_dir trt_engines/chatglm3_6b/fp16/2-gpu
# Use the chatglm3_6b_base model
python3 build.py -m chatglm3_6b_base --output_dir trt_engines/chatglm3_6b_base/fp16/1-gpu
# Use the chatglm3_6b_32k model
python3 build.py -m chatglm3_6b_32k --output_dir trt_engines/chatglm3_6b-32k/fp16/1-gpu
```
#### Configurable plugin options
* Use `--use_gpt_attention_plugin <DataType>` to configure the GPT Attention plugin (float16 by default).
* Use `--use_gemm_plugin <DataType>` to configure the GEMM plugin (float16 by default).
* Use `--use_rmsnorm_plugin <DataType>` to configure the RMS normalization plugin (float16 by default).
#### Fused Multi-Head Attention (FMHA)
* Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable the FMHA kernels, which gives better performance while reducing GPU memory usage.
* FMHA cannot be used if `--use_gpt_attention_plugin` is disabled.
* `--enable_context_fmha` uses an FP16 accumulator, which may slightly reduce accuracy. You can use `--enable_context_fmha_fp32_acc` instead to preserve accuracy, at the cost of part of the FMHA speedup.
#### Weight-Only quantization
* Use `--use_weight_only` to enable weight-only quantization, which speeds up inference and reduces GPU memory usage.
* Choose between int8 and int4 quantization with `--weight_only_precision int8` or `--weight_only_precision int4`; the default is int8.
#### In-flight Batching (requires serving with NVIDIA Triton)
* Use `--use_inflight_batching` to enable in-flight batching; the paged KV cache is enabled automatically along with it.
* The number of blocks in the paged KV cache can be configured with `--tokens_per_block`.
For more features and configuration options, see the [TensorRT-LLM ChatGLM implementation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/chatglm).
## 3. Run inference with the TensorRT-LLM Python runtime
#### Single-node, single-GPU example:
```bash
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Single-node, multi-GPU example:
```bash
mpirun -n 2 \
python ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
* If you run `mpirun` as root, you may need to add the `--allow-run-as-root` flag.
#### Run summarize.py for an article-summarization task:
```bash
python3 ../summarize.py --test_trt_llm \
--hf_model_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Run the chat demo we provide, [tensorrt_llm_cli_demo.py](tensorrt_llm_cli_demo.py):
```bash
python3 tensorrt_llm_cli_demo.py --tokenizer_dir chatglm3_6b --engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
Sample output:
```
用户: what is your name?
ChatGLM3-6B:Hello, I am an assistant named ChatGLM3-6B, and you can call me assistant. What can I help you with??
用户: what is new in ChatGLM3-6B compared with ChatGLM2-6B?
ChatGLM3-6B:ChatGLM3-6B is an improved version of ChatGLM2-6B. Compared with ChatGLM2-6B, ChatGLM3-6B has the following improvements:
1. Enhanced language understanding capabilities: ChatGLM3-6B's language model is based on the GLM3-6B model, which has been pre-trained on more diverse and large-scale data, resulting in better language understanding and generation capabilities.
2. Improved generation ability: ChatGLM3-6B has improved the generation ability compared to ChatGLM2-6B. With more training data and optimization algorithms, ChatGLM3-6B can generate more coherent and natural-looking text.
3. Enhanced adaptability to different dialogue scenarios: ChatGLM3-6B has been trained on more diverse dialogue data, including dialogue scenarios with different languages, cultures, and styles, making it more adaptable to different dialogue scenarios.
4. New features and functions: ChatGLM3-6B also has some new features and functions, such as support for multiple choice questions, sentiment analysis, and entity recognition.
In short, ChatGLM3-6B is more advanced and capable than ChatGLM2-6B, and can better meet the needs of users in various scenarios..
```
#### Benchmarking:
See [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/benchmarks/python) for how to benchmark ChatGLM3 running on TensorRT-LLM.
## 4. Deploy an online inference server with NVIDIA Triton
With NVIDIA Triton you can deploy a high-performance, scalable, and reliable inference service, and enable In-flight Batching to improve serving throughput. See the [In-flight Batching Triton Backend](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main/inflight_batcher_llm) for details.
"""
This script is a part of a larger project for generating text using large language models.
It includes functionalities for finding engine files, parsing arguments, setting up configurations for different models,
and executing the generation process with various settings.
This script particularly supports models like ChatGLM3-6B and its variants,
handling quantization, serialization, and runtime aspects.
Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Modifications made by Yuxuan.Zhang @ ZhipuAI on 2023-12-24.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Modifications:
1. Removed input_file, tokenizer_type, and other parameters unrelated to dialogue. Set num_beams to 1.
2. Adapted single turn dialogue into ChatGLM3-6B template and implemented multi-turn conversations.
"""
import argparse
import json
import torch
import transformers
from pathlib import Path
from typing import List
import tensorrt_llm
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (GenerationSession, ModelConfig, SamplingConfig)
def find_engines(dir: Path, model_name: str = "*", dtype: str = "*", tp_size: str = "*", rank: str = "*") -> List[Path]:
"""
Searches for engine files matching a specified pattern within a directory.
This is typically used to locate compiled model files for efficient execution on specific hardware.
Parameters:
- dir: The directory to search.
- model_name, dtype, tp_size, rank:
Pattern matching parameters to filter engine files by model name, data type,
tensor parallel size, and rank respectively.
Returns:
- A list of Paths pointing to the engine files.
"""
template = f"{model_name}_{dtype}_tp{tp_size}_rank{rank}.engine"
return list(dir.glob(template))
def parse_arguments(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--model_name',
type=str,
choices=[
"chatglm3_6b",
"chatglm3_6b_base",
"chatglm3_6b_32k"
],
default="chatglm3_6b",
help='the name of the model')
parser.add_argument('--max_output_len', type=int, default=4096)
parser.add_argument('--engine_dir', type=str, default=None)
parser.add_argument('--tokenizer_dir', type=str, default=None)
parser.add_argument('--temperature', type=float, default=0.95)
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0.8)
parser.add_argument('--random_seed', type=int, default=2023)
parser.add_argument('--streaming', default=True, action='store_true')
args = parser.parse_args(args)
return args
def main():
"""
The main execution function of the script. It orchestrates the text generation process
by performing several key steps:
- Parses command-line arguments to configure model details, output specifications,
and other user-defined parameters.
- Loads the model configuration from a specified directory and prepares the environment for text generation
based on the model and hardware specifics.
- Sets up the generation session with the appropriate model, tokenizer, and runtime configurations.
- Enters a loop to continuously accept user input, generate text based on the provided prompts, and output
the model's responses.
- Handles special commands such as 'stop' to end the conversation and 'clear' to reset the chat history.
- Manages resources and ensures that the generated text is properly formatted and presented to the user.
The function is designed to be the entry point of the script, invoking all necessary components and managing the
flow of data and control throughout the execution.
"""
args = parse_arguments()
config_path = Path(args.engine_dir) / 'config.json'
with open(config_path, 'r') as f:
config = json.load(f)
dtype = config['builder_config']['precision']
max_output_len = min(config['builder_config']['max_output_len'], args.max_output_len)
use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin']
remove_input_padding = config['builder_config']['remove_input_padding']
tp_size = config['builder_config']['tensor_parallel']
pp_size = config['builder_config']['pipeline_parallel']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
max_output_len = min(max_output_len, args.max_output_len)
runtime_rank = tensorrt_llm.mpi_rank()
runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=world_size)
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
serialize_path = find_engines(
dir=Path(args.engine_dir),
model_name=args.model_name,
dtype=dtype,
tp_size=world_size,
rank=runtime_rank)[0]
tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_dir, trust_remote_code=True)
model_config = ModelConfig(vocab_size=config['builder_config']['vocab_size'],
num_layers=config['builder_config']['num_layers'],
num_heads=config['builder_config']['num_heads'] // tp_size,
num_kv_heads=(config['builder_config']['num_kv_heads'] + tp_size - 1) // tp_size,
hidden_size=config['builder_config']['hidden_size'] // tp_size,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=config['builder_config']['remove_input_padding'],
model_name=args.model_name,
paged_kv_cache=config['builder_config']['paged_kv_cache'],
quant_mode=QuantMode(config['builder_config']['quant_mode']),
dtype=dtype)
sampling_config = SamplingConfig(
end_id=tokenizer.eos_token_id,
pad_id=tokenizer.pad_token_id,
num_beams=1,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p
)
sampling_config.random_seed = args.random_seed
with open(serialize_path, 'rb') as f:
engine_buffer = f.read()
    decoder = GenerationSession(model_config, engine_buffer, runtime_mapping)
history = []
while True:
input_text_with_history = ""
max_input_len = config['builder_config']['max_input_len']
input_text = input("用户: ")
if input_text.lower() == 'stop':
break
if input_text.lower() == 'clear':
history = []
print("ChatGLM3-6B: 对话历史已清空")
continue
history.append(input_text)
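        # Rebuild the prompt from the history: even indices are user turns wrapped in
        # <|user|>/<|assistant|> tags, odd indices are previous model replies appended as-is.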
for idx, content in enumerate(history):
if idx % 2 != 0:
input_text_with_history += "{}\n".format(content)
else:
input_text_with_history += "<|user|>{}\n<|assistant|>".format(content)
tokenized = tokenizer(
input_text_with_history,
return_tensors="pt",
padding=True,
return_length=True
)
input_ids = tokenized['input_ids'].int()
input_lengths = tokenized['length'].int()
max_input_len_real = torch.max(input_lengths)
if max_input_len_real > max_input_len:
input_ids = input_ids[:, :max_input_len]
input_lengths = torch.where(input_lengths > max_input_len, max_input_len, input_lengths)
else:
max_input_len = max_input_len_real
if remove_input_padding:
input_ids_no_padding = (torch.zeros(1, torch.sum(input_lengths), dtype=torch.int32))
lengths_acc = torch.cumsum(torch.cat([torch.IntTensor([0]), input_lengths]), dim=0)
for i in range(len(input_ids)):
input_ids_no_padding[0, lengths_acc[i]:lengths_acc[i + 1]] = torch.IntTensor(
input_ids[i, max_input_len - input_lengths[i]:max_input_len])
input_ids = input_ids_no_padding
elif use_gpt_attention_plugin:
input_ids_padding_right = torch.zeros_like(input_ids) + sampling_config.end_id
for i, sample in enumerate(input_ids):
nPadding = 0
for token in sample:
if token == sampling_config.pad_id:
nPadding += 1
else:
break
input_ids_padding_right[i, :len(sample[nPadding:])] = sample[nPadding:]
input_ids = input_ids_padding_right
input_lengths = torch.tensor([input_ids.shape[-1]], dtype=torch.int32)
decoder.setup(1, max_input_len, max_output_len, 1)
output = decoder.decode(
input_ids.contiguous().cuda(),
input_lengths.contiguous().cuda(),
sampling_config,
output_sequence_lengths=True,
return_dict=True,
streaming=args.streaming
)
print("ChatGLM3-6B:", end="")
generated_text = ""
if args.streaming:
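            # Each streamed item carries the full sequence generated so far; decode and
            # print only the newest token (at position sequence_length - 1).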
for output_item in output:
output_id = output_item["output_ids"]
output_sequence_lengths = output_item["sequence_lengths"]
output_id = output_id[0, 0, output_sequence_lengths[0, 0] - 1]
output_word = tokenizer.convert_ids_to_tokens(int(output_id))
output_word = output_word.replace("▁", " ")
output_word = tokenizer.convert_tokens_to_string(output_word)
print(output_word, end="", flush=True)
generated_text += output_word
print("\n")
else:
torch.cuda.synchronize()
output_ids = output["output_ids"][0]
output = output_ids[0, input_lengths.item():]
generated_text = tokenizer.decode(output, skip_special_tokens=True)
print(generated_text)
history.append(generated_text)
del decoder
print(f"Good bye!")
if __name__ == '__main__':
main()
# 工具调用
本文档将介绍如何使用 ChatGLM3-6B 进行工具调用。目前只有 ChatGLM3-6B 模型支持工具调用,而 ChatGLM3-6B-Base 和 ChatGLM3-6B-32K 模型不支持。
## 构建 System Prompt
这里以两个工具调用为例,首先准备好要构建的数据的描述信息。
```python
tools = [
{
"name": "track",
"description": "追踪指定股票的实时价格",
"parameters": {
"type": "object",
"properties": {
"symbol": {
"description": "需要追踪的股票代码"
}
},
"required": ['symbol']
}
},
{
"name": "text-to-speech",
"description": "将文本转换为语音",
"parameters": {
"type": "object",
"properties": {
"text": {
"description": "需要转换成语音的文本"
},
"voice": {
"description": "要使用的语音类型(男声、女声等)"
},
"speed": {
"description": "语音的速度(快、中等、慢等)"
}
},
"required": ['text']
}
}
]
system_info = {"role": "system", "content": "Answer the following questions as best as you can. You have access to the following tools:", "tools": tools}
```
请确保工具的定义格式与例子中一致以获得最优的性能
## 提出问题
注意:目前 ChatGLM3-6B 的工具调用只支持通过 `chat` 方法,不支持 `stream_chat` 方法。
```python
history = [system_info]
query = "帮我查询股票10111的价格"
response, history = model.chat(tokenizer, query, history=history)
print(response)
```
这里期望得到的输出为
```json
{"name": "track", "parameters": {"symbol": "10111"}}
```
这表示模型需要调用工具 `track`,并且需要传入参数 `symbol`
## 调用工具,生成回复
这里需要自行实现调用工具的逻辑。假设已经得到了返回结果,将结果以 json 格式返回给模型并得到回复。
```python
result = json.dumps({"price": 12412}, ensure_ascii=False)
response, history = model.chat(tokenizer, result, history=history, role="observation")
print(response)
```
这里 `role="observation"` 表示输入的是工具调用的返回值而不是用户输入,不能省略。
期望得到的输出为
```
根据您的查询,经过API的调用,股票10111的价格是12412。
```
这表示本次工具调用已经结束,模型根据返回结果生成回复。对于比较复杂的问题,模型可能需要进行多次工具调用。这时,可以根据返回的 `response` 是 `str` 还是 `dict` 来判断返回的是生成的回复还是工具调用请求。
# Tool Invocation
This document introduces how to use ChatGLM3-6B for tool invocation. Currently, only the ChatGLM3-6B model supports tool invocation; the ChatGLM3-6B-Base and ChatGLM3-6B-32K models do not.
## Building System Prompt
Here we take two tools as an example. First, prepare the descriptions of the tools to be registered.
```python
tools = [
{
"name": "track",
"description": "Track the real-time price of a specified stock",
"parameters": {
"type": "object",
"properties": {
"symbol": {
"description": "The stock code that needs to be tracked"
}
},
"required": ['symbol']
}
},
{
"name": "text-to-speech",
"description": "Convert text to speech",
"parameters": {
"type": "object",
"properties": {
"text": {
"description": "The text that needs to be converted into speech"
},
"voice": {
"description": "The type of voice to use (male, female, etc.)"
},
"speed": {
"description": "The speed of the speech (fast, medium, slow, etc.)"
}
},
"required": ['text']
}
}
]
system_info = {"role": "system", "content": "Answer the following questions as best as you can. You have access to the following tools:", "tools": tools}
```
Please ensure that the definition format of the tool is consistent with the example to obtain optimal performance.
## Asking Questions
Note: Currently, the tool invocation of ChatGLM3-6B only supports the `chat` method and does not support the `stream_chat` method.
```python
history = [system_info]
query = "Help me inquire the price of stock 10111"
response, history = model.chat(tokenizer, query, history=history)
print(response)
```
The expected output here is
```json
{"name": "track", "parameters": {"symbol": "10111"}}
```
This indicates that the model needs to call the tool `track`, and the parameter `symbol` needs to be passed in.
## Invoke Tool, Generate Response
Here, you need to implement the tool-calling logic yourself. Assuming the tool's result has been obtained, return it to the model in JSON format to get a response.
```python
result = json.dumps({"price": 12412}, ensure_ascii=False)
response, history = model.chat(tokenizer, result, history=history, role="observation")
print(response)
```
Here `role="observation"` indicates that the input is the return value of the tool invocation rather than user input, and it cannot be omitted.
The expected output is
```
Based on your query, after the API call, the price of stock 10111 is 12412.
```
This indicates that this tool invocation has finished and the model has generated a reply from the returned result. For more complex questions, the model may need to make several tool invocations in a row. In that case, check whether the returned `response` is a `str` or a `dict` to tell a final reply from another tool-invocation request, as sketched below.
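For multi-turn tool use, one simple pattern is to keep calling `chat` until the model returns plain text. The sketch below reuses `model`, `tokenizer`, and a `history` initialised with `system_info` from the examples above, and assumes a hypothetical `run_tool(name, parameters)` helper that executes the requested tool and returns a JSON-serialisable result:
```python
import json

def chat_until_text(query, history):
    # First turn: the user's question.
    response, history = model.chat(tokenizer, query, history=history)
    # A dict response is a tool-invocation request; a str response is the final answer.
    while isinstance(response, dict):
        result = run_tool(response["name"], response.get("parameters", {}))
        # Feed the tool output back as an observation so the model can continue.
        response, history = model.chat(tokenizer, json.dumps(result, ensure_ascii=False),
                                       history=history, role="observation")
    return response, history
```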
"""
This demo script is designed for interacting with ChatGLM3-6B to demonstrate its function-call (tool-calling) capabilities.
"""
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModel
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto").eval()
os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
stop_stream = False
def build_prompt(history):
prompt = "欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"
for query, response in history:
prompt += f"\n\n用户:{query}"
prompt += f"\n\nChatGLM3-6B:{response}"
return prompt
tools = [
{'name': 'track', 'description': '追踪指定股票的实时价格',
'parameters':
{
'type': 'object', 'properties':
{'symbol':
{
'description': '需要追踪的股票代码'
}
},
'required': []
}
}, {
'name': '/text-to-speech', 'description': '将文本转换为语音',
'parameters':
{
'type': 'object', 'properties':
{
'text':
{
'description': '需要转换成语音的文本'
},
'voice':
{
'description': '要使用的语音类型(男声、女声等)'
},
'speed': {
'description': '语音的速度(快、中等、慢等)'
}
}, 'required': []
}
},
{
'name': '/image_resizer', 'description': '调整图片的大小和尺寸',
'parameters': {'type': 'object',
'properties':
{
'image_file':
{
'description': '需要调整大小的图片文件'
},
'width':
{
'description': '需要调整的宽度值'
},
'height':
{
'description': '需要调整的高度值'
}
},
'required': []
}
},
{
'name': '/foodimg', 'description': '通过给定的食品名称生成该食品的图片',
'parameters': {
'type': 'object', 'properties':
{
'food_name':
{
'description': '需要生成图片的食品名称'
}
},
'required': []
}
}
]
system_item = {
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
}
def main():
past_key_values, history = None, [system_item]
role = "user"
global stop_stream
print("欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
while True:
query = input("\n用户:") if role == "user" else input("\n结果:")
if query.strip() == "stop":
break
if query.strip() == "clear":
past_key_values, history = None, [system_item]
role = "user"
os.system(clear_command)
print("欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
continue
print("\nChatGLM:", end="")
response, history = model.chat(tokenizer, query, history=history, role=role)
print(response, end="", flush=True)
print("")
if isinstance(response, dict):
role = "observation"
else:
role = "user"
if __name__ == "__main__":
main()