Commit 0e1045f0 authored by lvzhen

Revert "Merge branch 'master' into 'master'"

This reverts merge request !2
parent 467ec853
LOCAL_MODEL_PATH=<your_path>
LOCAL_EMBEDDING_MODEL_PATH=<your_path>
\ No newline at end of file
"""
This script implements an API for the ChatGLM3-6B model,
formatted similarly to OpenAI's API (https://platform.openai.com/docs/api-reference/chat).
It's designed to be run as a web server using FastAPI and uvicorn,
making the ChatGLM3-6B model accessible through the OpenAI client.
Key Components and Features:
- Model and Tokenizer Setup: Configures the model and tokenizer paths and loads them.
- FastAPI Configuration: Sets up a FastAPI application with CORS middleware for handling cross-origin requests.
- API Endpoints:
- "/v1/models": Lists the available models, specifically ChatGLM3-6B.
- "/v1/chat/completions": Processes chat completion requests with options for streaming and regular responses.
- "/v1/embeddings": Processes Embedding request of a list of text inputs.
- Token Limit Caution: In the OpenAI API, 'max_tokens' is equivalent to HuggingFace's 'max_new_tokens', not 'max_length'.
For instance, setting 'max_tokens' to 8192 for a 6b model would result in an error due to the model's inability to output
that many tokens after accounting for the history and prompt tokens.
- Stream Handling and Custom Functions: Manages streaming responses and custom function calls within chat responses.
- Pydantic Models: Defines structured models for requests and responses, enhancing API documentation and type safety.
- Main Execution: Initializes the model and tokenizer, and starts the FastAPI app on the designated host and port.
Note:
This script doesn't include the setup for special tokens or multi-GPU support by default.
Users need to configure their special tokens and can enable multi-GPU support as per the provided instructions.
The embedding model only supports running on a single GPU.
Running this script requires 14-15 GB of GPU memory: 2 GB for the embedding model and 12-13 GB for the FP16 ChatGLM3 LLM.
"""
import os
import time
import tiktoken
import torch
import uvicorn
import json
from fastapi import FastAPI, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
from loguru import logger
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModel
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
from sentence_transformers import SentenceTransformer
from tools.schema import tool_class, tool_def, tool_param_start_with, tool_define_param_name
from sse_starlette.sse import EventSourceResponse
# Set the SSE ping interval to keep long-running streaming requests alive
EventSourceResponse.DEFAULT_PING_INTERVAL = 1000
# set LLM path
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
# set Embedding Model path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', 'BAAI/bge-m3')
@asynccontextmanager
async def lifespan(app: FastAPI):
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class FunctionCallResponse(BaseModel):
name: Optional[str] = None
arguments: Optional[str] = None
class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system", "function"]
content: str = None
name: Optional[str] = None
function_call: Optional[FunctionCallResponse] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
function_call: Optional[FunctionCallResponse] = None
## for Embedding
class EmbeddingRequest(BaseModel):
input: Union[List[str], str]
model: str
class CompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class EmbeddingResponse(BaseModel):
data: list
model: str
object: str
usage: CompletionUsage
# for ChatCompletionRequest
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = 0.8
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
tools: Optional[Union[dict, List[dict]]] = None
repetition_penalty: Optional[float] = 1.1
agent: Optional[bool] = False
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length", "function_call"]
class ChatCompletionResponseStreamChoice(BaseModel):
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length", "function_call"]]
index: int
class ChatCompletionResponse(BaseModel):
model: str
id: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageInfo] = None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
if isinstance(request.input, str):
embeddings = [embedding_model.encode(request.input)]
else:
embeddings = [embedding_model.encode(text) for text in request.input]
embeddings = [embedding.tolist() for embedding in embeddings]
def num_tokens_from_string(string: str) -> int:
"""
Returns the number of tokens in a text string.
use cl100k_base tokenizer
"""
encoding = tiktoken.get_encoding('cl100k_base')
num_tokens = len(encoding.encode(string))
return num_tokens
response = {
"data": [
{
"object": "embedding",
"embedding": embedding,
"index": index
}
for index, embedding in enumerate(embeddings)
],
"model": request.model,
"object": "list",
"usage": CompletionUsage(
prompt_tokens=sum(len(text.split()) for text in request.input),
completion_tokens=0,
total_tokens=sum(num_tokens_from_string(text) for text in request.input),
)
}
return response
@app.get("/v1/models", response_model=ModelList)
async def list_models():
model_card = ModelCard(
id="chatglm3-6b"
)
return ModelList(
data=[model_card]
)
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer
if len(request.messages) < 1 or request.messages[-1].role == "assistant":
raise HTTPException(status_code=400, detail="Invalid request")
gen_params = dict(
messages=request.messages,
temperature=request.temperature,
top_p=request.top_p,
max_tokens=request.max_tokens or 1024,
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
agent=request.agent
)
logger.debug(f"==== request ====\n{gen_params}")
gen_params["tools"] = tool_def if gen_params["agent"] else []
if request.stream:
# Use stream mode to read the first few characters; if it is not a function call, stream the output directly
predict_stream_generator = predict_stream(request.model, gen_params)
output = next(predict_stream_generator)
if not contains_custom_function(output, gen_params["tools"]):
return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
# Otherwise, obtain the full result at once and determine whether tools need to be called.
logger.debug(f"First result output:\n{output}")
function_call = None
if output and request.tools:
try:
function_call = process_response(output, use_tool=True)
except:
logger.warning("Failed to parse tool call")
# CallFunction
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
"""
In this demo, we did not register any tools.
You can use the tools implemented in our `tools_using_demo` and add your own streaming tool implementation here,
similar to the following method:
"""
if tool_param_start_with in output:
tool = tool_class.get(function_call.name)
if tool:
this_tool_define_param_name = tool_define_param_name.get(function_call.name)
if this_tool_define_param_name:
tool_param = json.loads(function_call.arguments).get(this_tool_define_param_name)
if tool().parameter_validation(tool_param):
observation = str(tool().run(tool_param))
tool_response = observation
else:
tool_response = "Tool parameter values error, please tell the user about this situation."
else:
tool_response = "Tool parameter is not defined in tools schema, please tell the user about this situation."
else:
tool_response = "No available tools found, please tell the user about this situation."
else:
tool_response = "Tool parameter content error, please tell the user about this situation."
if not gen_params.get("messages"):
gen_params["messages"] = []
gen_params["messages"].append(ChatMessage(
role="assistant",
content=output,
))
gen_params["messages"].append(ChatMessage(
role="function",
name=function_call.name,
content=tool_response,
))
# Streaming output of results after function calls
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
else:
# Fallback when no function call was parsed; stream the first output directly.
generate = parse_output_text(request.model, output)
return EventSourceResponse(generate, media_type="text/event-stream")
# Here is the handling of stream = False
response = generate_chatglm3(model, tokenizer, gen_params)
# Remove the first newline character
if response["text"].startswith("\n"):
response["text"] = response["text"][1:]
response["text"] = response["text"].strip()
usage = UsageInfo()
function_call, finish_reason = None, "stop"
if request.tools:
try:
function_call = process_response(response["text"], use_tool=True)
except:
logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
finish_reason = "function_call"
function_call = FunctionCallResponse(**function_call)
message = ChatMessage(
role="assistant",
content=response["text"],
function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
)
logger.debug(f"==== message ====\n{message}")
choice_data = ChatCompletionResponseChoice(
index=0,
message=message,
finish_reason=finish_reason,
)
task_usage = UsageInfo.model_validate(response["usage"])
for usage_key, usage_value in task_usage.model_dump().items():
setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
return ChatCompletionResponse(
model=request.model,
id="", # for open_source model, id is empty
choices=[choice_data],
object="chat.completion",
usage=usage
)
async def predict(model_id: str, params: dict):
global model, tokenizer
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
for new_response in generate_stream_chatglm3(model, tokenizer, params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(previous_text):]
previous_text = decoded_unicode
finish_reason = new_response["finish_reason"]
if len(delta_text) == 0 and finish_reason != "function_call":
continue
function_call = None
if finish_reason == "function_call":
try:
function_call = process_response(decoded_unicode, use_tool=True)
except:
logger.warning(
"Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
delta = DeltaMessage(
content=delta_text,
role="assistant",
function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=delta,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def predict_stream(model_id, gen_params):
"""
Makes function calls compatible with stream-mode output.
The first several characters are inspected to decide whether the output is a function call.
If it is not a function call, the output is streamed directly.
Otherwise, the complete text of the function call is returned.
:param model_id:
:param gen_params:
:return:
"""
output = ""
is_function_call = False
has_send_first_chunk = False
for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(output):]
output = decoded_unicode
# While the output has not been identified as a function call and more than 7 characters have been generated,
# check for the special function prefix to decide whether this is a function call
if not is_function_call and len(output) > 7:
# Determine whether a function is called
is_function_call = contains_custom_function(output, gen_params["tools"])
if is_function_call:
continue
# Non-function call, direct stream output
finish_reason = new_response["finish_reason"]
# Send an empty string first to avoid truncation by subsequent next() operations.
if not has_send_first_chunk:
message = DeltaMessage(
content="",
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
send_msg = delta_text if has_send_first_chunk else output
has_send_first_chunk = True
message = DeltaMessage(
content=send_msg,
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
if is_function_call:
yield output
else:
yield '[DONE]'
async def parse_output_text(model_id: str, value: str):
"""
Directly output the text content of value
:param model_id:
:param value:
:return:
"""
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant", content=value),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def contains_custom_function(value: str, tools: list) -> bool:
"""
Determine whether the output is a 'function_call' based on a special function prefix.
[Note] This is not a rigorous judgment method, only for reference.
:param value:
:param tools:
:return:
"""
for tool in tools:
if value and tool["name"] in value:
return True
if __name__ == "__main__":
# Load LLM
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto").eval()
# load Embedding
embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
version: "3.6"
services:
glm3_api:
image: python:3.10.13-slim
restart: unless-stopped
working_dir: /glm3
container_name: glm3_api
env_file: ./.env
networks:
- v_glm3
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
- MODEL_PATH=/models/chatglm3-6b
- EMBEDDING_PATH=/models/bge-large-zh-v1.5
- TZ=Asia/Shanghai
- PYTHONDONTWRITEBYTECODE=1
- PYTHONUNBUFFERED=1
- DOCKER=True
ports:
- 8100:8000
volumes:
- ./:/glm3
- ${LOCAL_MODEL_PATH}:/models/chatglm3-6b
- ${LOCAL_EMBEDDING_MODEL_PATH}:/models/bge-large-zh-v1.5
command:
- sh
- -c
- |
sed -i s/deb.debian.org/mirrors.tencentyun.com/g /etc/apt/sources.list
sed -i s/security.debian.org/mirrors.tencentyun.com/g /etc/apt/sources.list
apt-get update
python -m pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple --upgrade pip
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python api_server.py
networks:
v_glm3:
driver: bridge
\ No newline at end of file
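A minimal way to bring this service up, assuming the compose file above is saved as `docker-compose.yml` next to `api_server.py` and the `.env` shown earlier is filled in (the model paths are placeholders):

```bash
# Sketch only: the stock python image installs requirements at container start (see the `command:` section above)
docker compose up -d                  # start the glm3_api service in the background
docker compose logs -f glm3_api       # follow dependency installation and server startup
curl http://127.0.0.1:8100/health     # container port 8000 is published on host port 8100
```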
"""
This script is designed for interacting with a local GLM3 AI model using the `ChatGLM3` class
from the `langchain_community` library. It facilitates continuous dialogue with the GLM3 model.
1. Start the Local Model Service: Before running this script, you need to execute the `api_server.py` script
to start the GLM3 model's service.
2. Run the Script: The script includes functionality for initializing the LLMChain object and obtaining AI responses,
allowing the user to input questions and receive AI answers.
3. This demo does not support streaming.
"""
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
from langchain_community.llms.chatglm3 import ChatGLM3
def initialize_llm_chain(messages: list):
template = "{input}"
prompt = PromptTemplate.from_template(template)
endpoint_url = "http://127.0.0.1:8000/v1/chat/completions"
llm = ChatGLM3(
endpoint_url=endpoint_url,
max_tokens=4096,
prefix_messages=messages,
top_p=0.9
)
return LLMChain(prompt=prompt, llm=llm)
def get_ai_response(llm_chain, user_message):
ai_response = llm_chain.invoke({"input": user_message})
return ai_response
def continuous_conversation():
messages = [
SystemMessage(content="You are an intelligent AI assistant, named ChatGLM3."),
]
while True:
user_input = input("Human (or 'exit' to quit): ")
if user_input.lower() == 'exit':
break
llm_chain = initialize_llm_chain(messages=messages)
ai_response = get_ai_response(llm_chain, user_input)
print("ChatGLM3: ", ai_response["text"])
messages += [
HumanMessage(content=user_input),
AIMessage(content=ai_response["text"]),
]
if __name__ == "__main__":
continuous_conversation()
"""
This script implements an API for the ChatGLM3-6B model,
formatted similarly to OpenAI's API (https://platform.openai.com/docs/api-reference/chat).
It's designed to be run as a web server using FastAPI and uvicorn,
making the ChatGLM3-6B model accessible through the OpenAI client.
Key Components and Features:
- Model and Tokenizer Setup: Configures the model and tokenizer paths and loads them.
- FastAPI Configuration: Sets up a FastAPI application with CORS middleware for handling cross-origin requests.
- API Endpoints:
- "/v1/models": Lists the available models, specifically ChatGLM3-6B.
- "/v1/chat/completions": Processes chat completion requests with options for streaming and regular responses.
- "/v1/embeddings": Processes Embedding request of a list of text inputs.
- Token Limit Caution: In the OpenAI API, 'max_tokens' is equivalent to HuggingFace's 'max_new_tokens', not 'max_length'.
For instance, setting 'max_tokens' to 8192 for a 6b model would result in an error due to the model's inability to output
that many tokens after accounting for the history and prompt tokens.
- Stream Handling and Custom Functions: Manages streaming responses and custom function calls within chat responses.
- Pydantic Models: Defines structured models for requests and responses, enhancing API documentation and type safety.
- Main Execution: Initializes the model and tokenizer, and starts the FastAPI app on the designated host and port.
Note:
This script doesn't include the setup for special tokens or multi-GPU support by default.
Users need to configure their special tokens and can enable multi-GPU support as per the provided instructions.
The embedding model only supports running on a single GPU.
"""
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8000/docs for documents.
# In the OpenAI API, max_tokens is equivalent to HuggingFace's max_new_tokens, not max_length.
# For example, for a 6b model, setting max_tokens = 8192 raises an error, because after deducting the history and prompt tokens the model cannot output that many tokens.
import os
import time
import tiktoken
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Response
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
# from sentence_transformers import SentenceTransformer
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModel
# Set the SSE ping interval to keep long-running streaming requests alive
EventSourceResponse.DEFAULT_PING_INTERVAL = 1000
# set LLM path
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
# set Embedding Model path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', 'BAAI/bge-large-zh-v1.5')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
@@ -108,33 +79,6 @@ class DeltaMessage(BaseModel):
function_call: Optional[FunctionCallResponse] = None
## for Embedding
class EmbeddingRequest(BaseModel):
input: List[str]
model: str
class CompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class EmbeddingResponse(BaseModel):
data: list
model: str
object: str
usage: CompletionUsage
# for ChatCompletionRequest
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
@@ -142,7 +86,8 @@ class ChatCompletionRequest(BaseModel):
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
tools: Optional[Union[dict, List[dict]]] = None
functions: Optional[Union[dict, List[dict]]] = None
# Additional parameters
repetition_penalty: Optional[float] = 1.1
@@ -153,68 +98,29 @@ class ChatCompletionResponseChoice(BaseModel):
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length", "function_call"]]
index: int
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
model: str
id: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageInfo] = None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
embeddings = [embedding_model.encode(text) for text in request.input]
embeddings = [embedding.tolist() for embedding in embeddings]
def num_tokens_from_string(string: str) -> int:
"""
Returns the number of tokens in a text string.
use cl100k_base tokenizer
"""
encoding = tiktoken.get_encoding('cl100k_base')
num_tokens = len(encoding.encode(string))
return num_tokens
response = {
"data": [
{
"object": "embedding",
"embedding": embedding,
"index": index
}
for index, embedding in enumerate(embeddings)
],
"model": request.model,
"object": "list",
"usage": CompletionUsage(
prompt_tokens=sum(len(text.split()) for text in request.input),
completion_tokens=0,
total_tokens=sum(num_tokens_from_string(text) for text in request.input),
)
}
return response
@app.get("/v1/models", response_model=ModelList)
async def list_models():
model_card = ModelCard(
id="chatglm3-6b"
)
return ModelList(
data=[model_card]
)
model_card = ModelCard(id="chatglm3-6b")
return ModelList(data=[model_card])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
@@ -232,74 +138,24 @@ async def create_chat_completion(request: ChatCompletionRequest):
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
tools=request.tools,
functions=request.functions,
)
logger.debug(f"==== request ====\n{gen_params}")
if request.stream:
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
# Use stream mode to read the first few characters; if it is not a function call, stream the output directly
predict_stream_generator = predict_stream(request.model, gen_params)
output = next(predict_stream_generator)
if not contains_custom_function(output):
return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
# Otherwise, obtain the full result at once and determine whether tools need to be called.
logger.debug(f"First result output:\n{output}")
function_call = None
if output and request.tools:
try:
function_call = process_response(output, use_tool=True)
except:
logger.warning("Failed to parse tool call")
# CallFunction
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
"""
In this demo, we did not register any tools.
You can use the tools implemented in our `tools_using_demo` and add your own streaming tool implementation here,
similar to the following method:
function_args = json.loads(function_call.arguments)
tool_response = dispatch_tool(tool_name: str, tool_params: dict)
"""
tool_response = ""
if not gen_params.get("messages"):
gen_params["messages"] = []
gen_params["messages"].append(ChatMessage(
role="assistant",
content=output,
))
gen_params["messages"].append(ChatMessage(
role="function",
name=function_call.name,
content=tool_response,
))
# Streaming output of results after function calls
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
else:
# Fallback when no function call was parsed; stream the first output directly.
generate = parse_output_text(request.model, output)
return EventSourceResponse(generate, media_type="text/event-stream")
# Here is the handling of stream = False
response = generate_chatglm3(model, tokenizer, gen_params)
# Remove the first newline character
if response["text"].startswith("\n"):
response["text"] = response["text"][1:]
response["text"] = response["text"].strip()
usage = UsageInfo()
function_call, finish_reason = None, "stop"
if request.tools:
if request.functions:
try:
function_call = process_response(response["text"], use_tool=True)
except:
@@ -325,14 +181,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
task_usage = UsageInfo.model_validate(response["usage"])
for usage_key, usage_value in task_usage.model_dump().items():
setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
return ChatCompletionResponse(
model=request.model,
id="", # for open_source model, id is empty
choices=[choice_data],
object="chat.completion",
usage=usage
)
return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
async def predict(model_id: str, params: dict):
@@ -343,7 +192,7 @@ async def predict(model_id: str, params: dict):
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
@@ -361,8 +210,7 @@ async def predict(model_id: str, params: dict):
try:
function_call = process_response(decoded_unicode, use_tool=True)
except:
logger.warning(
"Failed to parse tool call, maybe the response is not a tool call or have been answered.")
logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
@@ -378,12 +226,7 @@ async def predict(model_id: str, params: dict):
delta=delta,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
@@ -391,141 +234,16 @@ async def predict(model_id: str, params: dict):
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def predict_stream(model_id, gen_params):
"""
Makes function calls compatible with stream-mode output.
The first several characters are inspected to decide whether the output is a function call.
If it is not a function call, the output is streamed directly.
Otherwise, the complete text of the function call is returned.
:param model_id:
:param gen_params:
:return:
"""
output = ""
is_function_call = False
has_send_first_chunk = False
for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(output):]
output = decoded_unicode
# While the output has not been identified as a function call and more than 7 characters have been generated,
# check for the special function prefix to decide whether this is a function call
if not is_function_call and len(output) > 7:
# Determine whether a function is called
is_function_call = contains_custom_function(output)
if is_function_call:
continue
# Non-function call, direct stream output
finish_reason = new_response["finish_reason"]
# Send an empty string first to avoid truncation by subsequent next() operations.
if not has_send_first_chunk:
message = DeltaMessage(
content="",
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
send_msg = delta_text if has_send_first_chunk else output
has_send_first_chunk = True
message = DeltaMessage(
content=send_msg,
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
if is_function_call:
yield output
else:
yield '[DONE]'
async def parse_output_text(model_id: str, value: str):
"""
Directly output the text content of value
:param model_id:
:param value:
:return:
"""
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant", content=value),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def contains_custom_function(value: str) -> bool:
"""
Determine whether the output is a 'function_call' based on a special function prefix.
For example, the functions defined in "tools_using_demo/tool_register.py" are all named "get_xxx" and start with "get_".
[Note] This is not a rigorous judgment method, only for reference.
:param value:
:return:
"""
return value and 'get_' in value
if __name__ == "__main__":
# Load LLM
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH,
load_in_4bit=True,
trust_remote_code=True)
# load Embedding
# embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
if 'cuda' in DEVICE: # AMD, NVIDIA GPU can use Half Precision
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else: # CPU, Intel GPU and other devices fall back to Float32 precision (.float())
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
"""
This script is an example of using the OpenAI API to create various interactions with a ChatGLM3 model.
It includes functions to:
# Test the response with a curl command
# curl -X POST "http://127.0.0.1:8000/v1/chat/completions" \
# -H "Content-Type: application/json" \
# -d "{\"model\": \"chatglm3-6b\", \"messages\": [{\"role\": \"system\", \"content\": \"You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.\"}, {\"role\": \"user\", \"content\": \"你好,给我讲一个故事,大概100字\"}], \"stream\": false, \"max_tokens\": 100, \"temperature\": 0.8, \"top_p\": 0.8}"
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
# Test the response with Python code
import requests
import json
Each function demonstrates a different aspect of the API's capabilities, showcasing how to make requests
and handle responses.
"""
base_url = "http://127.0.0.1:8000"
from openai import OpenAI
base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
def create_chat_completion(model, messages, functions, use_stream=False):
data = {
"functions": functions,  # function definitions
"model": model,  # model name
"messages": messages,  # conversation history
"stream": use_stream,  # whether to stream the response
"max_tokens": 100,  # maximum number of tokens to generate
"temperature": 0.8,  # sampling temperature
"top_p": 0.8,  # top-p sampling probability
}
response = requests.post(f"{base_url}/v1/chat/completions", json=data, stream=use_stream)
if response.status_code == 200:
if use_stream:
# Handle streaming response
for line in response.iter_lines():
if line:
decoded_line = line.decode('utf-8')[6:]
try:
response_json = json.loads(decoded_line)
content = response_json.get("choices", [{}])[0].get("delta", {}).get("content", "")
print(content)
except:
print("Special Token:", decoded_line)
else:
# Handle non-streaming response
decoded_line = response.json()
content = decoded_line.get("choices", [{}])[0].get("message", "").get("content", "")
print(content)
else:
print("Error:", response.status_code)
return None
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
def function_chat(use_stream=True):
functions = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
"name": "get_current_weather",
"description": "Get the current weather in a given location.",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. Beijing",
},
"required": ["location"],
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
chat_messages = [
{
"role": "user",
"content": "波士顿天气如何?",
},
{
"role": "assistant",
"content": "get_current_weather\n ```python\ntool_call(location='Beijing', unit='celsius')\n```",
"function_call": {
"name": "get_current_weather",
"arguments": '{"location": "Beijing", "unit": "celsius"}',
},
},
{
"role": "function",
"name": "get_current_weather",
"content": '{"temperature": "12", "unit": "celsius", "description": "Sunny"}',
},
# ... The following is the assistant's reply and the user's follow-up.
# {
# "role": "assistant",
# "content": "根据最新的天气预报,目前北京的天气情况是晴朗的,温度为12摄氏度。",
# },
# {
# "role": "user",
# "content": "谢谢",
# }
]
create_chat_completion("chatglm3-6b", messages=chat_messages, functions=functions, use_stream=use_stream)
def simple_chat(use_stream=True):
messages = [
functions = None
chat_messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's "
"instructions carefully. Respond using markdown.",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你用生动的话语给我讲一个故事"
"content": "你好,给我讲一个故事,大概100字"
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
presence_penalty=1.1,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["你好,给我讲一个故事,大概100字"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
create_chat_completion("chatglm3-6b", messages=chat_messages, functions=functions, use_stream=use_stream)
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
function_chat(use_stream=False)
# simple_chat(use_stream=True)
openai>=1.3.0
pydantic>=2.5.1
\ No newline at end of file
import os
import gc
import json
import torch
from torch.nn import Module
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers import AutoModel
from transformers.generation.logits_process import LogitsProcessor
from typing import Union, Tuple
from typing import Dict, Union, Optional, Tuple
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
# transformer.word_embeddings occupies 1 layer
# transformer.final_layernorm and lm_head occupy 1 layer
# transformer.layers occupies 28 layers
# 30 layers in total, distributed across num_gpus cards
num_trans_layers = 28
per_gpu_layers = 30 / num_gpus
# bugfix: on Linux, the weight and input passed to torch.embedding can end up on different devices, causing a RuntimeError
# on Windows, model.device is set to transformer.word_embeddings.device
# on Linux, model.device is set to lm_head.device
# when chat or stream_chat is called, input_ids are placed on model.device
# if transformer.word_embeddings.device and model.device differ, a RuntimeError is raised
# therefore transformer.word_embeddings, transformer.final_layernorm and lm_head are all placed on the first card
# this file comes from https://github.com/THUDM/ChatGLM-6B/blob/main/utils.py
# with only minor modifications to support ChatGLM3
device_map = {
'transformer.embedding.word_embeddings': 0,
'transformer.encoder.final_layernorm': 0,
'transformer.output_layer': 0,
'transformer.rotary_pos_emb': 0,
'lm_head': 0
}
used = 2
gpu_target = 0
for i in range(num_trans_layers):
if used >= per_gpu_layers:
gpu_target += 1
used = 0
assert gpu_target < num_gpus
device_map[f'transformer.encoder.layers.{i}'] = gpu_target
used += 1
return device_map
def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
if num_gpus < 2 and device_map is None:
model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
else:
from accelerate import dispatch_model
model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()
if device_map is None:
device_map = auto_configure_device_map(num_gpus)
model = dispatch_model(model, device_map=device_map)
return model
class InvalidScoreLogitsProcessor(LogitsProcessor):
@@ -46,13 +103,13 @@ def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
@torch.inference_mode()
def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
messages = params["messages"]
tools = params["tools"]
functions = params["functions"]
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_tokens", 256))
echo = params.get("echo", True)
messages = process_chatglm_messages(messages, tools=tools)
messages = process_chatglm_messages(messages, functions=functions)
query, role = messages[-1]["content"], messages[-1]["role"]
inputs = tokenizer.build_chat_input(query, history=messages[:-1], role=role)
@@ -65,7 +122,6 @@ def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokeni
eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command("<|user|>"),
tokenizer.get_command("<|observation|>")
]
gen_kwargs = {
@@ -120,19 +176,17 @@ def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokeni
torch.cuda.empty_cache()
def process_chatglm_messages(messages, tools=None):
def process_chatglm_messages(messages, functions=None):
_messages = messages
messages = []
msg_has_sys = False
if tools:
if functions:
messages.append(
{
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
"tools": functions
}
)
msg_has_sys = True
for m in _messages:
role, content, func_call = m.role, m.content, m.function_call
@@ -155,9 +209,6 @@ def process_chatglm_messages(messages, tools=None):
}
)
else:
if role == "system" and msg_has_sys:
msg_has_sys = False
continue
messages.append({"role": role, "content": content})
return messages
"""
This script is an example of using the Zhipu API to create various interactions with a ChatGLM3 model. It includes
functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities,
showcasing how to make requests and handle responses.
Note: Make sure your Zhipu API key is set as an environment variable
in the format xxx.xxx (it is only checked for format; a real key is not needed).
"""
from zhipuai import ZhipuAI
base_url = "http://127.0.0.1:8000/v1/"
client = ZhipuAI(api_key="EMP.TY", base_url=base_url)
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="chatglm3_6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=True):
messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow "
"the user's instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你介绍一下chatglm3-6b这个模型"
}
]
response = client.chat.completions.create(
model="chatglm3_",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["ChatGLM3-6B 是一个大型的中英双语模型。"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
# basic requirements
protobuf
pydantic==1.10.9
transformers==4.30.2
sentencepiece==0.1.99
accelerate==0.21.0
sse-starlette
astunparse==1.6.2
protobuf>=4.25.3
transformers>=4.39.3
tokenizers>=0.15.0
cpm_kernels>=1.0.11
torch>=2.1.0
gradio>=4.26.0
sentencepiece>=0.2.0
sentence_transformers>=2.4.0
accelerate>=0.29.2
streamlit>=1.33.0
fastapi>=0.110.0
loguru~=0.7.2
mdtex2html>=1.3.0
latex2mathml>=3.77.0
jupyter_client>=8.6.1
nltk
# for openai demo
#openai>=1.17.1
#zhipuai>=2.0.1
#pydantic>=2.7.0
#sse-starlette>=2.0.0
#uvicorn>=0.29.0
#timm>=0.9.16
#tiktoken>=0.6.0
# for langchain demo
#langchain>=0.1.16
#langchainhub>=0.1.15
#arxiv>=2.1.0
# Deploying ChatGLM3 with NVIDIA TensorRT-LLM
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA. You can follow the steps below to deploy the ChatGLM3 model with TensorRT-LLM.
## 1. Install TensorRT-LLM
#### Get the TensorRT-LLM code:
```bash
# The TensorRT-LLM code must be pulled with git-lfs
apt-get update && apt-get -y install git git-lfs
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
# This walkthrough uses the v0.7.0 release
git checkout tags/v0.7.0 -b release/0.7.0
git submodule update --init --recursive
git lfs install
git lfs pull
```
#### Build the Docker image and install TensorRT-LLM:
```bash
make -C docker release_build
```
#### Run the Docker image:
```bash
make -C docker release_run
```
## 2. Build the TensorRT-LLM inference engine for ChatGLM3:
#### Install Python dependencies:
```bash
cd ./examples/chatglm
pip install -r requirements.txt
apt-get update
apt-get install git-lfs
```
#### Download the ChatGLM3 model from Hugging Face:
```
# Download only the model variant(s) you want to deploy
git clone https://huggingface.co/THUDM/chatglm3-6b chatglm3_6b
git clone https://huggingface.co/THUDM/chatglm3-6b-base chatglm3_6b_base
git clone https://huggingface.co/THUDM/chatglm3-6b-32k chatglm3_6b_32k
```
#### Build the inference engine with build.py:
Here are some examples of building inference engines with build.py:
```bash
# Build a default engine with fp16 precision
python3 build.py -m chatglm3_6b --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a default fp16 engine with FMHA enabled (see below)
python3 build.py -m chatglm3_6b --enable_context_fmha --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a w8a16 engine
python3 build.py -m chatglm3_6b --use_weight_only --output_dir trt_engines/chatglm3_6b/weight_only/1-gpu
# Build a default fp16 engine with support for two GPUs
python3 build.py -m chatglm3_6b --world_size 2 --output_dir trt_engines/chatglm3_6b/fp16/2-gpu
# Use the chatglm3_6b_base model
python3 build.py -m chatglm3_6b_base --output_dir trt_engines/chatglm3_6b_base/fp16/1-gpu
# Use the chatglm3_6b_32k model
python3 build.py -m chatglm3_6b_32k --output_dir trt_engines/chatglm3_6b-32k/fp16/1-gpu
```
#### Configurable plugin parameters
* Use `--use_gpt_attention_plugin <DataType>` to configure the GPT Attention plugin (float16 by default).
* Use `--use_gemm_plugin <DataType>` to configure the GEMM plugin (float16 by default).
* Use `--use_rmsnorm_plugin <DataType>` to configure the RMS normalization plugin (float16 by default). A combined example is sketched below.
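For illustration, a build command that explicitly sets all three plugins to float16 (the stated defaults) might look like the following sketch; the output directory name is just an example:

```bash
# Illustrative only: explicitly select float16 for the three plugins described above
python3 build.py -m chatglm3_6b \
    --use_gpt_attention_plugin float16 \
    --use_gemm_plugin float16 \
    --use_rmsnorm_plugin float16 \
    --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
```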
#### Fused Multi-Head Attention (FMHA)
* Use the `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` flag to enable the FMHA kernels, which gives better performance while reducing GPU memory usage.
* FMHA cannot be used if `--use_gpt_attention_plugin` is disabled.
* `--enable_context_fmha` uses an FP16 accumulator, which may slightly reduce accuracy. You can instead use `--enable_context_fmha_fp32_acc` to preserve accuracy, at the cost of slightly less of the FMHA performance gain (see the example below).
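A sketch of the FP32-accumulator variant, assuming the same single-GPU fp16 build as above (the output directory name is hypothetical):

```bash
# Illustrative: FMHA with FP32 accumulation to preserve accuracy
python3 build.py -m chatglm3_6b \
    --enable_context_fmha_fp32_acc \
    --output_dir trt_engines/chatglm3_6b/fp16_fmha_fp32acc/1-gpu
```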
#### Weight-Only quantization
* Use `--use_weight_only` to enable weight-only quantization, which speeds up inference and reduces GPU memory usage.
* You can also switch between `--weight_only_precision int8` and `--weight_only_precision int4` to choose int8 or int4 quantization; the default is int8 (see the example below).
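For example, an int4 weight-only build might look like the following sketch (the output directory name is hypothetical):

```bash
# Illustrative: weight-only int4 quantization
python3 build.py -m chatglm3_6b \
    --use_weight_only \
    --weight_only_precision int4 \
    --output_dir trt_engines/chatglm3_6b/weight_only_int4/1-gpu
```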
#### In-flight Batching (requires NVIDIA Triton for inference)
* Use `--use_inflight_batching` to enable in-flight batching; when it is enabled, Paged KV Cache is also enabled automatically.
* The paged KV cache can be tuned with `--tokens_per_block`, which sets the number of tokens per block (see the example below).
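As a sketch, a build intended for in-flight batching served through Triton might combine these flags as follows (the `--tokens_per_block` value and output directory are just example choices):

```bash
# Illustrative: engine built with in-flight batching and paged KV cache
python3 build.py -m chatglm3_6b \
    --use_gpt_attention_plugin float16 \
    --use_inflight_batching \
    --tokens_per_block 64 \
    --output_dir trt_engines/chatglm3_6b/ifb/1-gpu
```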
For more detailed features and configuration options, see the [TensorRT-LLM ChatGLM implementation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/chatglm).
## 3. Run inference with the TensorRT-LLM Python runtime
#### Single-node, single-GPU inference example:
```bash
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Single-node, multi-GPU inference example:
```bash
mpirun -n 2 \
python ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
* If you run `mpirun` as root, you may need to add the `--allow-run-as-root` flag.
#### Run summarize.py for an article summarization task:
```bash
python3 ../summarize.py --test_trt_llm \
--hf_model_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Run the provided chat demo, [tensorrt_llm_cli_demo.py](tensorrt_llm_cli_demo.py):
```bash
python3 tensorrt_llm_cli_demo.py --tokenizer_dir chatglm3_6b --engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
Sample output:
```
用户: what is your name?
ChatGLM3-6B:Hello, I am an assistant named ChatGLM3-6B, and you can call me assistant. What can I help you with??
用户: what is new in ChatGLM3-6B compared with ChatGLM2-6B?
ChatGLM3-6B:ChatGLM3-6B is an improved version of ChatGLM2-6B. Compared with ChatGLM2-6B, ChatGLM3-6B has the following improvements:
1. Enhanced language understanding capabilities: ChatGLM3-6B's language model is based on the GLM3-6B model, which has been pre-trained on more diverse and large-scale data, resulting in better language understanding and generation capabilities.
2. Improved generation ability: ChatGLM3-6B has improved the generation ability compared to ChatGLM2-6B. With more training data and optimization algorithms, ChatGLM3-6B can generate more coherent and natural-looking text.
3. Enhanced adaptability to different dialogue scenarios: ChatGLM3-6B has been trained on more diverse dialogue data, including dialogue scenarios with different languages, cultures, and styles, making it more adaptable to different dialogue scenarios.
4. New features and functions: ChatGLM3-6B also has some new features and functions, such as support for multiple choice questions, sentiment analysis, and entity recognition.
In short, ChatGLM3-6B is more advanced and capable than ChatGLM2-6B, and can better meet the needs of users in various scenarios..
```
#### Performance testing:
Instructions for benchmarking ChatGLM3 running on TensorRT-LLM are available [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/benchmarks/python).
## 4. Deploy an online inference server with NVIDIA Triton
NVIDIA Triton lets you deploy a high-performance, scalable, and stable inference service, and you can enable in-flight batching to improve serving throughput. See the [In-flight Batching Triton Backend](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main/inflight_batcher_llm) for details.
\ No newline at end of file
"""
This script is a part of a larger project for generating text using large language models.
It includes functionalities for finding engine files, parsing arguments, setting up configurations for different models,
and executing the generation process with various settings.
This script particularly supports models like ChatGLM3-6B and its variants,
handling quantization, serialization, and runtime aspects.
Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Modifications made by Yuxuan.Zhang @ ZhipuAI on 2023-12-24.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Modifications:
1. Removed input_file, tokenizer_type, and other parameters unrelated to dialogue. Set num_beams to 1.
2. Adapted single turn dialogue into ChatGLM3-6B template and implemented multi-turn conversations.
"""
import argparse
import json
import torch
import transformers
from pathlib import Path
from typing import List
import tensorrt_llm
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (GenerationSession, ModelConfig, SamplingConfig)
def find_engines(dir: Path, model_name: str = "*", dtype: str = "*", tp_size: str = "*", rank: str = "*") -> List[Path]:
"""
Searches for engine files matching a specified pattern within a directory.
This is typically used to locate compiled model files for efficient execution on specific hardware.
Parameters:
- dir: The directory to search.
- model_name, dtype, tp_size, rank:
Pattern matching parameters to filter engine files by model name, data type,
tensor parallel size, and rank respectively.
Returns:
- A list of Paths pointing to the engine files.
"""
template = f"{model_name}_{dtype}_tp{tp_size}_rank{rank}.engine"
return list(dir.glob(template))
def parse_arguments(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--model_name',
type=str,
choices=[
"chatglm3_6b",
"chatglm3_6b_base",
"chatglm3_6b_32k"
],
default="chatglm3_6b",
help='the name of the model')
parser.add_argument('--max_output_len', type=int, default=4096)
parser.add_argument('--engine_dir', type=str, default=None)
parser.add_argument('--tokenizer_dir', type=str, default=None)
parser.add_argument('--temperature', type=float, default=0.95)
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0.8)
parser.add_argument('--random_seed', type=int, default=2023)
parser.add_argument('--streaming', default=True, action='store_true')
args = parser.parse_args(args)
return args
def main():
"""
The main execution function of the script. It orchestrates the text generation process
by performing several key steps:
- Parses command-line arguments to configure model details, output specifications,
and other user-defined parameters.
- Loads the model configuration from a specified directory and prepares the environment for text generation
based on the model and hardware specifics.
- Sets up the generation session with the appropriate model, tokenizer, and runtime configurations.
- Enters a loop to continuously accept user input, generate text based on the provided prompts, and output
the model's responses.
- Handles special commands such as 'stop' to end the conversation and 'clear' to reset the chat history.
- Manages resources and ensures that the generated text is properly formatted and presented to the user.
The function is designed to be the entry point of the script, invoking all necessary components and managing the
flow of data and control throughout the execution.
"""
args = parse_arguments()
config_path = Path(args.engine_dir) / 'config.json'
with open(config_path, 'r') as f:
config = json.load(f)
dtype = config['builder_config']['precision']
max_output_len = min(config['builder_config']['max_output_len'], args.max_output_len)
use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin']
remove_input_padding = config['builder_config']['remove_input_padding']
tp_size = config['builder_config']['tensor_parallel']
pp_size = config['builder_config']['pipeline_parallel']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
max_output_len = min(max_output_len, args.max_output_len)
runtime_rank = tensorrt_llm.mpi_rank()
runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=world_size)
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
serialize_path = find_engines(
dir=Path(args.engine_dir),
model_name=args.model_name,
dtype=dtype,
tp_size=world_size,
rank=runtime_rank)[0]
tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_dir, trust_remote_code=True)
model_config = ModelConfig(vocab_size=config['builder_config']['vocab_size'],
num_layers=config['builder_config']['num_layers'],
num_heads=config['builder_config']['num_heads'] // tp_size,
num_kv_heads=(config['builder_config']['num_kv_heads'] + tp_size - 1) // tp_size,
hidden_size=config['builder_config']['hidden_size'] // tp_size,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=config['builder_config']['remove_input_padding'],
model_name=args.model_name,
paged_kv_cache=config['builder_config']['paged_kv_cache'],
quant_mode=QuantMode(config['builder_config']['quant_mode']),
dtype=dtype)
sampling_config = SamplingConfig(
end_id=tokenizer.eos_token_id,
pad_id=tokenizer.pad_token_id,
num_beams=1,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p
)
sampling_config.random_seed = args.random_seed
with open(serialize_path, 'rb') as f:
engine_buffer = f.read()
decoder = GenerationSession(model_config, engine_buffer, runtime_mapping)
history = []
while True:
input_text_with_history = ""
max_input_len = config['builder_config']['max_input_len']
input_text = input("用户: ")
if input_text.lower() == 'stop':
break
if input_text.lower() == 'clear':
history = []
print("ChatGLM3-6B: 对话历史已清空")
continue
history.append(input_text)
for idx, content in enumerate(history):
if idx % 2 != 0:
input_text_with_history += "{}\n".format(content)
else:
input_text_with_history += "<|user|>{}\n<|assistant|>".format(content)
tokenized = tokenizer(
input_text_with_history,
return_tensors="pt",
padding=True,
return_length=True
)
input_ids = tokenized['input_ids'].int()
input_lengths = tokenized['length'].int()
max_input_len_real = torch.max(input_lengths)
if max_input_len_real > max_input_len:
input_ids = input_ids[:, :max_input_len]
input_lengths = torch.where(input_lengths > max_input_len, max_input_len, input_lengths)
else:
max_input_len = max_input_len_real
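# Layout note: when remove_input_padding is enabled, the whole batch is packed into a single
# row containing only real tokens (per-sequence lengths are tracked separately); otherwise,
# with the GPT attention plugin, leading pad tokens are skipped and each sequence is copied
# left-aligned into a buffer pre-filled with the end-of-sequence id.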
if remove_input_padding:
input_ids_no_padding = (torch.zeros(1, torch.sum(input_lengths), dtype=torch.int32))
lengths_acc = torch.cumsum(torch.cat([torch.IntTensor([0]), input_lengths]), dim=0)
for i in range(len(input_ids)):
input_ids_no_padding[0, lengths_acc[i]:lengths_acc[i + 1]] = torch.IntTensor(
input_ids[i, max_input_len - input_lengths[i]:max_input_len])
input_ids = input_ids_no_padding
elif use_gpt_attention_plugin:
input_ids_padding_right = torch.zeros_like(input_ids) + sampling_config.end_id
for i, sample in enumerate(input_ids):
nPadding = 0
for token in sample:
if token == sampling_config.pad_id:
nPadding += 1
else:
break
input_ids_padding_right[i, :len(sample[nPadding:])] = sample[nPadding:]
input_ids = input_ids_padding_right
input_lengths = torch.tensor([input_ids.shape[-1]], dtype=torch.int32)
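# The four positional arguments below are batch size, max input (context) length, max new
# tokens and beam width, per the TensorRT-LLM GenerationSession API this demo targets.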
decoder.setup(1, max_input_len, max_output_len, 1)
output = decoder.decode(
input_ids.contiguous().cuda(),
input_lengths.contiguous().cuda(),
sampling_config,
output_sequence_lengths=True,
return_dict=True,
streaming=args.streaming
)
print("ChatGLM3-6B:", end="")
generated_text = ""
if args.streaming:
for output_item in output:
output_id = output_item["output_ids"]
output_sequence_lengths = output_item["sequence_lengths"]
output_id = output_id[0, 0, output_sequence_lengths[0, 0] - 1]
output_word = tokenizer.convert_ids_to_tokens(int(output_id))
output_word = output_word.replace("▁", " ")
output_word = tokenizer.convert_tokens_to_string(output_word)
print(output_word, end="", flush=True)
generated_text += output_word
print("\n")
else:
torch.cuda.synchronize()
output_ids = output["output_ids"][0]
output = output_ids[0, input_lengths.item():]
generated_text = tokenizer.decode(output, skip_special_tokens=True)
print(generated_text)
history.append(generated_text)
del decoder
print(f"Good bye!")
if __name__ == '__main__':
main()
"""
This demo script is designed for interacting with ChatGLM3-6B to demonstrate its Function Call (tool-calling) capabilities.
"""
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModel
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
if 'cuda' in DEVICE:  # AMD, NVIDIA GPU can use Half Precision
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else:  # CPU, Intel GPU and other devices fall back to full precision (float32)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
@@ -26,79 +26,10 @@ def build_prompt(history):
return prompt
tools = [
{'name': 'track', 'description': '追踪指定股票的实时价格',
'parameters':
{
'type': 'object', 'properties':
{'symbol':
{
'description': '需要追踪的股票代码'
}
},
'required': []
}
}, {
'name': '/text-to-speech', 'description': '将文本转换为语音',
'parameters':
{
'type': 'object', 'properties':
{
'text':
{
'description': '需要转换成语音的文本'
},
'voice':
{
'description': '要使用的语音类型(男声、女声等)'
},
'speed': {
'description': '语音的速度(快、中等、慢等)'
}
}, 'required': []
}
},
{
'name': '/image_resizer', 'description': '调整图片的大小和尺寸',
'parameters': {'type': 'object',
'properties':
{
'image_file':
{
'description': '需要调整大小的图片文件'
},
'width':
{
'description': '需要调整的宽度值'
},
'height':
{
'description': '需要调整的高度值'
}
},
'required': []
}
},
{
'name': '/foodimg', 'description': '通过给定的食品名称生成该食品的图片',
'parameters': {
'type': 'object', 'properties':
{
'food_name':
{
'description': '需要生成图片的食品名称'
}
},
'required': []
}
}
]
system_item = {
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
}
def main():
past_key_values, history = None, [system_item]
@@ -110,7 +41,7 @@ def main():
if query.strip() == "stop":
break
if query.strip() == "clear":
past_key_values, history = None, [system_item]
role = "user"
os.system(clear_command)
print("欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
@@ -121,9 +52,7 @@ def main():
print("")
if isinstance(response, dict):
role = "observation"
else:
role = "user"
if __name__ == "__main__":
main()
\ No newline at end of file
@@ -12,13 +12,13 @@ client = OpenAI(
api_key = "xxx"
)
tools = get_tools()
functions = get_tools()
def run_conversation(query: str, stream=False, tools=None, max_retry=5):
def run_conversation(query: str, stream=False, functions=None, max_retry=5):
params = dict(model="chatglm3", messages=[{"role": "user", "content": query}], stream=stream)
if tools:
params["tools"] = tools
if functions:
params["functions"] = functions
response = client.chat.completions.create(**params)
for _ in range(max_retry):
@@ -74,7 +74,7 @@ def run_conversation(query: str, stream=False, tools=None, max_retry=5):
{
"role": "function",
"name": function_call.name,
"content": tool_response,
"content": tool_response, # 调用函数返回结果
}
)
@@ -90,4 +90,4 @@ if __name__ == "__main__":
logger.info("\n=========== next conversation ===========")
query = "帮我查询北京的天气怎么样"
run_conversation(query, tools=tools, stream=True)
run_conversation(query, functions=functions, stream=True)
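# Minimal sketch of what the legacy "functions"-style request amounts to (the server is the
# OpenAI-compatible ChatGLM3 endpoint configured above; get_tools() and the weather query are
# the demo's own):
#   response = client.chat.completions.create(
#       model="chatglm3",
#       messages=[{"role": "user", "content": "帮我查询北京的天气怎么样"}],
#       functions=get_tools(),
#   )
#   # a tool call, if any, is expected on response.choices[0].message.function_call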
tools = [
{
"name": "track",
"description": "追踪指定股票的实时价格",
"parameters": {
"type": "object",
"properties": {
"symbol": {
"description": "需要追踪的股票代码"
}
},
"required": ['symbol']
}
},
{
"name": "text-to-speech",
"description": "将文本转换为语音",
"parameters": {
"type": "object",
"properties": {
"text": {
"description": "需要转换成语音的文本"
},
"voice": {
"description": "要使用的语音类型(男声、女声等)"
},
"speed": {
"description": "语音的速度(快、中等、慢等)"
}
},
"required": ['text']
}
}
]
system_info = {"role": "system", "content": "Answer the following questions as best as you can. You have access to the following tools:", "tools": tools}
import os
import platform
from transformers import AutoTokenizer, AutoModel
import torch
MODEL_PATH = os.environ.get('MODEL_PATH', '../../chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
if 'cuda' in DEVICE: # AMD, NVIDIA GPU can use Half Precision
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else:  # CPU, Intel GPU and other devices fall back to full precision (float32)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
history = [system_info]
query = "帮我查询股票10111的价格"
response, history = model.chat(tokenizer, query, history=history)
print(response)
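# For a query like this, a tool-invoking reply is a dict rather than plain text, e.g.
# {'name': 'track', 'parameters': {'symbol': '10111'}} (illustrative values); the caller is
# expected to run the tool and feed its output back to model.chat with role="observation".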
\ No newline at end of file
"""
This module implements tool registration: by registering tools, the model is able to make tool (function) calls.
"""
import inspect
import traceback
from copy import deepcopy
@@ -40,7 +37,7 @@ def register_tool(func: callable):
tool_def = {
"name": tool_name,
"description": tool_description,
"parameters": tool_params
"params": tool_params
}
print("[registered tool] " + pformat(tool_def))
@@ -65,7 +62,7 @@ def get_tools() -> dict:
return deepcopy(_TOOL_DESCRIPTIONS)
# Tool Definitions
@register_tool
def random_number_generator(
......
#!/bin/bash
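# Upgrade pip, then install/upgrade every package listed in requirements.txt one at a time
# via the Tsinghua PyPI mirror.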
python -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
while read -r requirement; do
python -m pip install --upgrade "$requirement" -i https://pypi.tuna.tsinghua.edu.cn/simple
done < requirements.txt