Commit 67ca83cf authored by Rayyyyy

Support GLM-4-0414

parent 78ba9d16
jieba>=0.42.1
datasets>=2.20.0
peft>=0.15.1
deepspeed>=0.16.5
nltk==3.8.1
rouge_chinese==1.0.3
ruamel.yaml>=0.18.6
typer>=0.13.0
tqdm>=4.67.0
data_config:
train_file: train.jsonl
val_file: dev.jsonl
test_file: dev.jsonl
num_proc: 1
max_input_length: 128
max_output_length: 128
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# adjust to fit the dataset
learning_rate: 5e-4
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 500
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
# see `transformers.GenerationConfig`
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
#deepspeed: ds_zero_3.json
peft_config:
peft_type: PREFIX_TUNING
task_type: CAUSAL_LM
num_virtual_tokens: 512
num_attention_heads: 2
token_dim: 256
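For reference, the `peft_config` block above maps directly onto a `peft` prefix-tuning configuration. A minimal sketch, assuming the YAML is saved as `configs/ptuning_v2.yaml` (hypothetical path) and parsed with `ruamel.yaml`, which is listed in the requirements:
```python
# Minimal sketch (assumption): load the YAML config above and build a peft config from it.
# The path configs/ptuning_v2.yaml is hypothetical; adjust it to your layout.
from pathlib import Path

from peft import get_peft_config
from ruamel.yaml import YAML

yaml = YAML(typ="safe")
config = yaml.load(Path("configs/ptuning_v2.yaml"))

# peft_type: PREFIX_TUNING selects PrefixTuningConfig; the remaining keys
# (task_type, num_virtual_tokens, num_attention_heads, token_dim) are its fields.
peft_config = get_peft_config(dict(config["peft_config"]))
print(peft_config)
```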
from pathlib import Path
from typing import Annotated, Union
import typer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast
)
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
def load_model_and_tokenizer(
model_dir: Union[str, Path], trust_remote_code: bool = True
) -> tuple[ModelType, TokenizerType]:
model_dir = Path(model_dir).expanduser().resolve()
if (model_dir / 'adapter_config.json').exists():
model = AutoPeftModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model.peft_config['default'].base_model_name_or_path
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model_dir
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, trust_remote_code=trust_remote_code, encode_special_tokens=True, use_fast=False
)
return model, tokenizer
@app.command()
def main(
model_dir: Annotated[str, typer.Argument(help='')],
):
messages = [
{
"role": "system", "content": "",
"tools":
[
{
"type": "function",
"function": {
"name": "create_calendar_event",
"description": "Create a new calendar event",
"parameters": {
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title of the event"
},
"start_time": {
"type": "string",
"description": "The start time of the event in the format YYYY-MM-DD HH:MM"
},
"end_time": {
"type": "string",
"description": "The end time of the event in the format YYYY-MM-DD HH:MM"
}
},
"required": [
"title",
"start_time",
"end_time"
]
}
}
}
]
},
{
"role": "user",
"content": "Can you help me create a calendar event for my meeting tomorrow? The title is \"Team Meeting\". It starts at 10:00 AM and ends at 11:00 AM."
},
]
model, tokenizer = load_model_and_tokenizer(model_dir)
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_tensors="pt"
).to(model.device)
generate_kwargs = {
"input_ids": inputs,
"max_new_tokens": 1024,
"do_sample": True,
"top_p": 0.8,
"temperature": 0.8,
"repetition_penalty": 1.2,
"eos_token_id": model.config.eos_token_id,
}
outputs = model.generate(**generate_kwargs)
response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True).strip()
print("=========")
print(response)
if __name__ == '__main__':
app()
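A typical invocation of the inference script above (the file name `inference.py` is assumed; the checkpoint path is illustrative and should point to a finetuned output directory or a PEFT adapter directory):
```shell
# Assumed file name and illustrative checkpoint path
python inference.py ./output/checkpoint-3000
```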
jieba>=0.42.1
datasets>=2.19.1
peft>=0.11.0
nltk==3.8.1
ruamel.yaml==0.18.6
rouge_chinese==1.0.3
\ No newline at end of file
#!/bin/bash
export HIP_VISIBLE_DEVICES=1 # change to the GPU id you want to use
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python finetune.py ../data/AdvertiseGen/saves/ ../checkpoints/glm-4-9b-chat/ configs/lora.yaml
\ No newline at end of file
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # change to the GPU ids you want to use
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune.py ../data/AdvertiseGen/saves/ ../checkpoints/glm-4-9b-chat/ configs/lora.yaml
\ No newline at end of file
# Inference
[中文阅读](README_zh.md)
Please follow the steps in the document strictly to avoid unnecessary errors.
## Device and dependency check
### Install dependencies
```shell
pip install -r requirements.txt
```
### Related Inference Benchmark Data
**All benchmark data in this document was collected under the hardware environment listed below. Actual memory usage and runtime may vary depending on your deployment setup. Please refer to your actual environment.**
Test Hardware:
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ CMake: 3.23.0
+ CUDA Version: 12.4
+ GPU Driver: 535.104.05
+ GPU: NVIDIA H100 80GB HBM3 * 8
The following stress test results show memory usage and latency during inference. If multiple GPUs are used, "Memory Usage" refers to the maximum usage on a single GPU.
#### GLM-4-32B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-------------|-------|---------------|---------------------|-------------------|--------------|
| BF16 | 1 | 68 GB | 0.16s | 24.4 tokens/s | 1000 |
| BF16 | 1 | 72 GB | 1.37s | 16.9 tokens/s | 8000 |
| BF16 | 2 | 50 GB | 6.75s | 8.1 tokens/s | 32000 |
| BF16 | 4 | 55 GB | 37.83s | 3.0 tokens/s | 100000 |
#### GLM-4-9B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 19 GB | 0.05s | 44.4 tokens/s | 1000 |
| BF16 | 1 | 25 GB | 0.39s | 39.0 tokens/s | 8000 |
| BF16 | 1 | 31 GB | 2.29s | 18.7 tokens/s | 32000 |
| BF16 | 1 | 55 GB | 6.80s | 14.1 tokens/s | 100000 |
#### GLM-4-9B-Chat-1M
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 75 GB | 98.4s | 2.3 tokens/s | 200000 |
#### GLM-4V-9B
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 28 GB | 0.1s | 33.4 tokens/s | 1000 |
| BF16 | 1 | 33 GB | 0.7s | 39.2 tokens/s | 8000 |
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| INT4 | 1 | 10 GB | 0.1s | 28.7 tokens/s | 1000 |
| INT4 | 1 | 15 GB | 0.8s | 24.2 tokens/s | 8000 |
## Quick Start
### Use transformers backend code
+ Use the command line to communicate with the GLM-4-9B model.
```shell
python trans_cli_demo.py # LLMs such as GLM-4-9B-0414
python trans_cli_vision_demo.py # GLM-4V-9B
```
+ Use the Gradio web client to communicate with the GLM-4-9B model.
```shell
python trans_web_demo.py # LLMs such as GLM-4-9B-0414
python trans_web_vision_demo.py # GLM-4V-9B
```
+ Use batch inference.
```shell
python trans_batch_demo.py # LLMs such as GLM-4-9B-0414
```
### Use vLLM backend code
+ Use the command line to communicate with the GLM-4-9B-Chat model.
```shell
python vllm_cli_demo.py # LLMs such as GLM-4-9B-0414
```
+ Launch an OpenAI-compatible API service.
```shell
vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
```
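Once the service is up, any OpenAI-compatible client can talk to it. A minimal sketch, assuming the default vLLM port 8000 and the served model name shown above:
```python
# Minimal sketch: query the vLLM OpenAI-compatible server started above.
# Assumes the default port 8000; the model name must match the served model.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")
response = client.chat.completions.create(
    model="THUDM/GLM-4-9B-0414",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```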
### Use glm-4v to build an OpenAI-compatible service
Start the server:
```shell
python glm4v_server.py THUDM/glm-4v-9b
```
Client request:
```shell
python glm4v_api_request.py
```
## Stress test
Users can run this script on their own devices to test the model's generation speed with the transformers backend:
```shell
python trans_stress_test.py
```
The stress test script supports enabling **SwanLab** to track the stress testing process and record metrics:
```shell
# The API Key can be obtained by logging in to https://swanlab.cn/
python trans_stress_test.py --swanlab_api_key "Your SwanLab API Key"
```
Passing `--swanlab_api_key local` enables SwanLab's local mode.
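For example, to keep the tracking data on the local machine:
```shell
# Enable SwanLab local mode; no data is uploaded to swanlab.cn
python trans_stress_test.py --swanlab_api_key local
```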
## Run the code on Ascend NPUs
Users can run the above code on Ascend hardware. The only changes needed are importing from `openmind` instead of `transformers` and setting the device to `npu` instead of `cuda`:
```python
#from transformers import AutoModelForCausalLM, AutoTokenizer
from openmind import AutoModelForCausalLM, AutoTokenizer
#device = 'cuda'
device = 'npu'
```
# Inference
Read this in [English](README.md)
Please follow the steps in the document strictly to avoid unnecessary errors.
## Device and dependency check
### Install dependencies
```shell
pip install -r requirements.txt
```
### Related Inference Benchmark Data
**All benchmark data in this document was collected under the hardware environment listed below. Actual memory usage and runtime may vary depending on your deployment setup. Please refer to your actual environment.**
Test Hardware:
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ CUDA Version: 12.4
+ CMake: 3.23.0
+ GPU Driver: 535.104.05
+ GPU: NVIDIA H100 80GB HBM3 * 8
The following stress test results show memory usage and latency during inference. If multiple GPUs are used, "Memory Usage" refers to the maximum usage on a single GPU.
#### GLM-4-32B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 68 GB | 0.16s | 24.4 tokens/s | 1000 |
| BF16 | 1 | 72 GB | 1.37s | 16.9 tokens/s | 8000 |
| BF16 | 2 | 50 GB | 6.75s | 8.1 tokens/s | 32000 |
| BF16 | 4 | 55 GB | 37.83s | 3.0 tokens/s | 100000 |
#### GLM-4-9B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 19 GB | 0.05s | 44.4 tokens/s | 1000 |
| BF16 | 1 | 25 GB | 0.39s | 39.0 tokens/s | 8000 |
| BF16 | 1 | 31 GB | 2.29s | 18.7 tokens/s | 32000 |
| BF16 | 1 | 55 GB | 6.80s | 14.1 tokens/s | 100000 |
#### GLM-4-9B-Chat-1M
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 75 GB | 98.4s | 2.3 tokens/s | 200000 |
#### GLM-4V-9B
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 28 GB | 0.1s | 33.4 tokens/s | 1000 |
| BF16 | 1 | 33 GB | 0.7s | 39.2 tokens/s | 8000 |
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| INT4 | 1 | 10 GB | 0.1s | 28.7 tokens/s | 1000 |
| INT4 | 1 | 15 GB | 0.8s | 24.2 tokens/s | 8000 |
## Quick Start
### Use transformers backend code
+ Use the command line to communicate with the GLM-4-9B model.
```shell
python trans_cli_demo.py # LLMs such as GLM-4-9B-0414
python trans_cli_vision_demo.py # GLM-4V-9B
```
+ Use the Gradio web client to communicate with the GLM-4-9B model.
```shell
python trans_web_demo.py # LLMs such as GLM-4-9B-0414
python trans_web_vision_demo.py # GLM-4V-9B
```
+ Use batch inference.
```shell
python trans_batch_demo.py
```
### Use vLLM backend code
+ Use the command line to communicate with the GLM-4-9B-Chat model.
```shell
python vllm_cli_demo.py # LLMs such as GLM-4-9B-0414
```
+ Launch an OpenAI-compatible API service.
```shell
vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
```
### Use glm-4v to build an OpenAI-compatible service
Start the server:
```shell
python glm4v_server.py THUDM/glm-4v-9b
```
Client request:
```shell
python glm4v_api_request.py
```
## Stress test
Users can run this script on their own devices to test the model's generation speed with the transformers backend:
```shell
python trans_stress_test.py
```
The stress test script supports enabling **SwanLab** to track the stress testing process and record metrics:
```shell
# The API Key can be obtained by logging in to https://swanlab.cn/
python trans_stress_test.py --swanlab_api_key "Your SwanLab API Key"
```
Passing `--swanlab_api_key local` enables SwanLab's local mode.
## Run the code on Ascend NPUs
Users can run the above code on Ascend hardware. The only changes needed are importing from `openmind` instead of `transformers` and setting the device to `npu` instead of `cuda`:
```python
#from transformers import AutoModelForCausalLM, AutoTokenizer
from openmind import AutoModelForCausalLM, AutoTokenizer
#device = 'cuda'
device = 'npu'
```
"""
This script is an OpenAI API request demo for the glm-4v-9b model; it simply uses the OpenAI client to interact with the model.
For LLMs such as GLM-4-9B-0414, use it together with the vLLM OpenAI-compatible server:
vllm serve THUDM/GLM-4-32B-0414 --tensor_parallel_size 4
"""
import base64
from openai import OpenAI
base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
def create_chat_completion(messages, use_stream=False):
response = client.chat.completions.create(
model="glm-4v",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.4,
presence_penalty=1.2,
top_p=0.8,
)
if response:
if use_stream:
for chunk in response:
print(chunk)
else:
print(response)
else:
print("Error:", response.status_code)
def encode_image(image_path):
"""
Encodes an image file into a base64 string.
Args:
image_path (str): The path to the image file.
This function opens the specified image file, reads its content, and encodes it into a base64 string.
The base64 encoding is used to send images over HTTP as text.
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def glm4v_simple_image_chat(use_stream=False, img_path=None):
"""
Facilitates a simple chat interaction involving an image.
Args:
use_stream (bool): Specifies whether to use streaming for chat responses.
img_path (str): Path to the image file to be included in the chat.
This function encodes the specified image and constructs a predefined conversation involving the image.
It then calls `create_chat_completion` to generate a response from the model.
The conversation includes asking about the content of the image and a follow-up question.
"""
img_url = f"data:image/jpeg;base64,{encode_image(img_path)}"
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What’s in this image?",
},
{
"type": "image_url",
"image_url": {"url": img_url},
},
],
},
{
"role": "assistant",
"content": "The image displays a wooden boardwalk extending through a vibrant green grassy wetland. The sky is partly cloudy with soft, wispy clouds, indicating nice weather. Vegetation is seen on either side of the boardwalk, and trees are present in the background, suggesting that this area might be a natural reserve or park designed for ecological preservation and outdoor recreation. The boardwalk allows visitors to explore the area without disturbing the natural habitat.",
},
{"role": "user", "content": "Do you think this is a spring or winter photo?"},
]
create_chat_completion(messages=messages, use_stream=use_stream)
if __name__ == "__main__":
glm4v_simple_image_chat(use_stream=False, img_path="demo.jpg")
import base64
import gc
import sys
import threading
import time
from contextlib import asynccontextmanager
from io import BytesIO
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import requests
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModelForCausalLM
from PIL import Image
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer
TORCH_TYPE = (
torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
An asynchronous context manager for managing the lifecycle of the FastAPI app.
It ensures that GPU memory is cleared after the app's lifecycle ends, which is essential for efficient resource management in GPU environments.
"""
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelCard(BaseModel):
"""
A Pydantic model representing a model card, which provides metadata about a machine learning model.
It includes fields like model ID, owner, and creation time.
"""
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class ImageUrl(BaseModel):
url: str
class TextContent(BaseModel):
type: Literal["text"]
text: str
class ImageUrlContent(BaseModel):
type: Literal["image_url"]
image_url: ImageUrl
ContentItem = Union[TextContent, ImageUrlContent]
class ChatMessageInput(BaseModel):
role: Literal["user", "assistant", "system"]
content: Union[str, List[ContentItem]]
name: Optional[str] = None
class ChatMessageResponse(BaseModel):
role: Literal["assistant"]
content: Optional[str] = None
name: Optional[str] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessageInput]
temperature: Optional[float] = 0.8
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
# Additional parameters
repetition_penalty: Optional[float] = 1.0
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessageResponse
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
model: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageInfo] = None
@app.get("/v1/models", response_model=ModelList)
async def list_models():
"""
An endpoint to list available models. It returns a list of model cards.
This is useful for clients to query and understand what models are available for use.
"""
model_card = ModelCard(id="GLM-4v-9b")
return ModelList(data=[model_card])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer
if len(request.messages) < 1 or request.messages[-1].role == "assistant":
raise HTTPException(status_code=400, detail="Invalid request")
gen_params = dict(
messages=request.messages,
temperature=request.temperature,
top_p=request.top_p,
max_tokens=request.max_tokens or 1024,
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
)
if request.stream:
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
response = generate_glm4v(model, tokenizer, gen_params)
usage = UsageInfo()
message = ChatMessageResponse(
role="assistant",
content=response["text"],
)
choice_data = ChatCompletionResponseChoice(
index=0,
message=message,
)
task_usage = UsageInfo.model_validate(response["usage"])
for usage_key, usage_value in task_usage.model_dump().items():
setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
def predict(model_id: str, params: dict):
global model, tokenizer
choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage(role="assistant"), finish_reason=None)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
for new_response in generate_stream_glm4v(model, tokenizer, params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(previous_text) :]
previous_text = decoded_unicode
delta = DeltaMessage(content=delta_text, role="assistant")
choice_data = ChatCompletionResponseStreamChoice(index=0, delta=delta)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage())
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
def generate_glm4v(model: AutoModel, tokenizer: AutoTokenizer, params: dict):
"""
Generates a response using the GLM-4v-9b model. It processes the chat history and image data, if any,
and then invokes the model to generate a response.
"""
response = None
for response in generate_stream_glm4v(model, tokenizer, params):
pass
return response
def process_history_and_images(
messages: List[ChatMessageInput],
) -> Tuple[Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]:
"""
Process history messages to extract text, identify the last user query,
and convert base64 encoded image URLs to PIL images.
Args:
messages(List[ChatMessageInput]): List of ChatMessageInput objects.
return: A tuple of three elements:
- The last user query as a string.
- Text history formatted as a list of tuples for the model.
- List of PIL Image objects extracted from the messages.
"""
formatted_history = []
image_list = []
last_user_query = ""
for i, message in enumerate(messages):
role = message.role
content = message.content
if isinstance(content, list): # text
text_content = " ".join(item.text for item in content if isinstance(item, TextContent))
else:
text_content = content
if isinstance(content, list): # image
for item in content:
if isinstance(item, ImageUrlContent):
image_url = item.image_url.url
if image_url.startswith("data:image/jpeg;base64,"):
base64_encoded_image = image_url.split("data:image/jpeg;base64,")[1]
image_data = base64.b64decode(base64_encoded_image)
image = Image.open(BytesIO(image_data)).convert("RGB")
else:
response = requests.get(image_url, verify=False)
image = Image.open(BytesIO(response.content)).convert("RGB")
image_list.append(image)
if role == "user":
if i == len(messages) - 1: # last user message
last_user_query = text_content
else:
formatted_history.append((text_content, ""))
elif role == "assistant":
if formatted_history:
if formatted_history[-1][1] != "":
assert False, f"the last query is answered. answer again. {formatted_history[-1][0]}, {formatted_history[-1][1]}, {text_content}"
formatted_history[-1] = (formatted_history[-1][0], text_content)
else:
assert False, "assistant reply before user"
else:
assert False, f"unrecognized role: {role}"
return last_user_query, formatted_history, image_list
@torch.inference_mode()
def generate_stream_glm4v(model: AutoModel, tokenizer: AutoTokenizer, params: dict):
uploaded = False
messages = params["messages"]
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_tokens", 256))
query, history, image_list = process_history_and_images(messages)
inputs = []
for idx, (user_msg, model_msg) in enumerate(history):
if idx == len(history) - 1 and not model_msg:
inputs.append({"role": "user", "content": user_msg})
if image_list and not uploaded:
inputs[-1].update({"image": image_list[0]})
uploaded = True
break
if user_msg:
inputs.append({"role": "user", "content": user_msg})
if model_msg:
inputs.append({"role": "assistant", "content": model_msg})
if len(image_list) >= 1:
inputs.append({"role": "user", "content": query, "image": image_list[0]})
else:
inputs.append({"role": "user", "content": query})
model_inputs = tokenizer.apply_chat_template(
inputs, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
).to(next(model.parameters()).device)
input_echo_len = len(model_inputs["input_ids"][0])
streamer = TextIteratorStreamer(tokenizer=tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs = {
"repetition_penalty": repetition_penalty,
"max_new_tokens": max_new_tokens,
"do_sample": True if temperature > 1e-5 else False,
"top_p": top_p if temperature > 1e-5 else 0,
"top_k": 1,
"streamer": streamer,
"eos_token_id": [151329, 151336, 151338],
}
if temperature > 1e-5:
gen_kwargs["temperature"] = temperature
generated_text = ""
def generate_text():
with torch.no_grad():
model.generate(**model_inputs, **gen_kwargs)
generation_thread = threading.Thread(target=generate_text)
generation_thread.start()
total_len = input_echo_len
for next_text in streamer:
generated_text += next_text
total_len = len(tokenizer.encode(generated_text))
yield {
"text": generated_text,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
}
generation_thread.join()
print("\033[91m--generated_text\033[0m", generated_text)
yield {
"text": generated_text,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
}
gc.collect()
torch.cuda.empty_cache()
if __name__ == "__main__":
MODEL_PATH = sys.argv[1]
model_dir = Path(MODEL_PATH).expanduser().resolve()
if (model_dir / "adapter_config.json").exists():
import json
with open(model_dir / "adapter_config.json", "r", encoding="utf-8") as file:
config = json.load(file)
model = AutoModel.from_pretrained(
config.get("base_model_name_or_path"), device_map="auto", torch_dtype=TORCH_TYPE
)
model = PeftModelForCausalLM.from_pretrained(
model=model,
model_id=model_dir,
)
tokenizer = AutoTokenizer.from_pretrained(config.get("base_model_name_or_path"), encode_special_tokens=True)
model.eval()
else:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, encode_special_tokens=True)
model = AutoModel.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
device_map="auto",
).eval()
uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
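For reference, the `/v1/chat/completions` endpoint defined above can also be called directly over HTTP. A minimal sketch, assuming the server is running locally on port 8000 as configured and using an illustrative image URL:
```python
# Sketch: call the glm4v_server.py endpoint directly (assumes localhost:8000).
# The image URL is illustrative; data:image/jpeg;base64 URLs are also accepted.
import requests

payload = {
    "model": "glm-4v",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpg"}},
            ],
        }
    ],
    "max_tokens": 256,
}
resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```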
transformers>=4.51.3
sentencepiece>=0.2.0
jinja2>=3.1.4
pydantic>=2.11.1
timm>=1.0.15
tiktoken>=0.9.0
numpy<2
accelerate>=1.6.0
sentence_transformers>=3.1.1
gradio>=5.23.3
openai>=1.70.0
einops>=0.8.0
pillow>=10.4.0
sse-starlette>=2.1.3
bitsandbytes>=0.44.1 # INT4 Loading, Not support for NPU
peft>=0.15.0 # Using with finetune model
swanlab>=0.5.5
# git+https://github.com/vllm-project/vllm.git For vLLM
"""
Here is an example of making batch requests to the GLM-4-0414 models and the glm-4-9b-chat-hf model with the transformers library.
You need to build the conversation format yourself and then call the batch function to make batch requests.
Please note that in this demo, the memory consumption is significantly higher.
"""
from typing import Union
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
MODEL_PATH = "THUDM/GLM-4-9B-0414"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto").eval()
def process_model_outputs(inputs, outputs, tokenizer):
responses = []
for input_ids, output_ids in zip(inputs.input_ids, outputs):
response = tokenizer.decode(output_ids[len(input_ids) :], skip_special_tokens=True).strip()
responses.append(response)
return responses
def batch(
model,
tokenizer,
messages: Union[str, list[str]],
max_input_tokens: int = 8192,
max_new_tokens: int = 8192,
num_beams: int = 1,
do_sample: bool = True,
top_p: float = 0.8,
temperature: float = 0.8,
logits_processor=None,
):
if logits_processor is None:
logits_processor = LogitsProcessorList()
messages = [messages] if isinstance(messages, str) else messages
batched_inputs = tokenizer(
messages, return_tensors="pt", padding="max_length", truncation=True, max_length=max_input_tokens
).to(model.device)
gen_kwargs = {
"max_new_tokens": max_new_tokens,
"num_beams": num_beams,
"do_sample": do_sample,
"top_p": top_p,
"temperature": temperature,
"logits_processor": logits_processor,
"eos_token_id": model.config.eos_token_id,
}
batched_outputs = model.generate(**batched_inputs, **gen_kwargs)
batched_response = process_model_outputs(batched_inputs, batched_outputs, tokenizer)
return batched_response
if __name__ == "__main__":
batch_message = [
[
{"role": "user", "content": "我的爸爸和妈妈结婚为什么不能带我去"},
{"role": "assistant", "content": "因为他们结婚时你还没有出生"},
{"role": "user", "content": "我刚才的提问是"},
],
[{"role": "user", "content": "你好,你是谁"}],
]
batch_inputs = []
max_input_tokens = 128
for i, messages in enumerate(batch_message):
new_batch_input = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)[12:]
max_input_tokens = max(max_input_tokens, len(new_batch_input))
batch_inputs.append(new_batch_input)
gen_kwargs = {
"max_input_tokens": max_input_tokens,
"max_new_tokens": 256,
"do_sample": True,
"top_p": 0.8,
"temperature": 0.8,
"num_beams": 1,
}
batch_responses = batch(model, tokenizer, batch_inputs, **gen_kwargs)
for response in batch_responses:
print("=" * 10)
print(response)