Commit 67ca83cf authored by Rayyyyy

Support GLM-4-0414

parent 78ba9d16
jieba>=0.42.1
datasets>=2.20.0
peft>=0.15.1
deepspeed>=0.16.5
nltk==3.8.1
rouge_chinese==1.0.3
ruamel.yaml>=0.18.6
typer>=0.13.0
tqdm>=4.67.0
data_config:
train_file: train.jsonl
val_file: dev.jsonl
test_file: dev.jsonl
num_proc: 1
max_input_length: 128
max_output_length: 128
training_args:
# see `transformers.Seq2SeqTrainingArguments`
output_dir: ./output
max_steps: 3000
# adjust to fit the dataset
learning_rate: 5e-4
# settings for data loading
per_device_train_batch_size: 4
dataloader_num_workers: 16
remove_unused_columns: false
# settings for saving checkpoints
save_strategy: steps
save_steps: 500
# settings for logging
log_level: info
logging_strategy: steps
logging_steps: 500
# settings for evaluation
per_device_eval_batch_size: 16
evaluation_strategy: steps
eval_steps: 500
# settings for optimizer
# adam_epsilon: 1e-6
# uncomment the following line to detect nan or inf values
# debug: underflow_overflow
predict_with_generate: true
# see `transformers.GenerationConfig`
generation_config:
max_new_tokens: 512
# set your absolute deepspeed path here
#deepspeed: ds_zero_3.json
peft_config:
peft_type: PREFIX_TUNING
task_type: CAUSAL_LM
num_virtual_tokens: 512
num_attention_heads: 2
token_dim: 256
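For reference, the `peft_config` block above maps directly onto a `peft` prefix-tuning configuration. A minimal sketch, assuming the YAML is saved as `configs/ptuning_v2.yaml` (hypothetical path) and parsed with `ruamel.yaml`, which is listed in the requirements:
```python
# Minimal sketch (assumption): load the YAML config above and build a peft config from it.
# The path configs/ptuning_v2.yaml is hypothetical; adjust it to your layout.
from pathlib import Path

from peft import get_peft_config
from ruamel.yaml import YAML

yaml = YAML(typ="safe")
config = yaml.load(Path("configs/ptuning_v2.yaml"))

# peft_type: PREFIX_TUNING selects PrefixTuningConfig; the remaining keys
# (task_type, num_virtual_tokens, num_attention_heads, token_dim) are its fields.
peft_config = get_peft_config(dict(config["peft_config"]))
print(peft_config)
```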
from pathlib import Path
from typing import Annotated, Union
import typer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast
)
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
app = typer.Typer(pretty_exceptions_show_locals=False)
def load_model_and_tokenizer(
model_dir: Union[str, Path], trust_remote_code: bool = True
) -> tuple[ModelType, TokenizerType]:
model_dir = Path(model_dir).expanduser().resolve()
if (model_dir / 'adapter_config.json').exists():
model = AutoPeftModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model.peft_config['default'].base_model_name_or_path
else:
model = AutoModelForCausalLM.from_pretrained(
model_dir, trust_remote_code=trust_remote_code, device_map='auto'
)
tokenizer_dir = model_dir
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_dir, trust_remote_code=trust_remote_code, encode_special_tokens=True, use_fast=False
)
return model, tokenizer
@app.command()
def main(
model_dir: Annotated[str, typer.Argument(help='')],
):
messages = [
{
"role": "system", "content": "",
"tools":
[
{
"type": "function",
"function": {
"name": "create_calendar_event",
"description": "Create a new calendar event",
"parameters": {
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The title of the event"
},
"start_time": {
"type": "string",
"description": "The start time of the event in the format YYYY-MM-DD HH:MM"
},
"end_time": {
"type": "string",
"description": "The end time of the event in the format YYYY-MM-DD HH:MM"
}
},
"required": [
"title",
"start_time",
"end_time"
]
}
}
}
]
},
{
"role": "user",
"content": "Can you help me create a calendar event for my meeting tomorrow? The title is \"Team Meeting\". It starts at 10:00 AM and ends at 11:00 AM."
},
]
model, tokenizer = load_model_and_tokenizer(model_dir)
inputs = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_tensors="pt"
).to(model.device)
generate_kwargs = {
"input_ids": inputs,
"max_new_tokens": 1024,
"do_sample": True,
"top_p": 0.8,
"temperature": 0.8,
"repetition_penalty": 1.2,
"eos_token_id": model.config.eos_token_id,
}
outputs = model.generate(**generate_kwargs)
response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True).strip()
print("=========")
print(response)
if __name__ == '__main__':
app()
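A typical invocation of the inference script above (the file name `inference.py` is assumed; the checkpoint path is illustrative and should point to a finetuned output directory or a PEFT adapter directory):
```shell
# Assumed file name and illustrative checkpoint path
python inference.py ./output/checkpoint-3000
```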
jieba>=0.42.1
datasets>=2.19.1
peft>=0.11.0
nltk==3.8.1
ruamel.yaml==0.18.6
rouge_chinese==1.0.3
\ No newline at end of file
#!/bin/bash
export HIP_VISIBLE_DEVICES=1 # change to the GPU id you want to use
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python finetune.py ../data/AdvertiseGen/saves/ ../checkpoints/glm-4-9b-chat/ configs/lora.yaml
\ No newline at end of file
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # change to the GPU ids you want to use
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune.py ../data/AdvertiseGen/saves/ ../checkpoints/glm-4-9b-chat/ configs/lora.yaml
\ No newline at end of file
# Inference
[中文阅读](README_zh.md)
Please follow the steps in the document strictly to avoid unnecessary errors.
## Device and dependency check
### Install dependencies
```shell
pip install -r requirements.txt
```
### Related Inference Benchmark Data
**All benchmark data in this document was collected under the hardware environment listed below. Actual memory usage and runtime may vary depending on your deployment setup. Please refer to your actual environment.**
Test Hardware:
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ CMake: 3.23.0
+ CUDA Version: 12.4
+ GPU Driver: 535.104.05
+ GPU: NVIDIA H100 80GB HBM3 * 8
The following stress test results show memory usage and latency during inference. If multiple GPUs are used, "Memory Usage" refers to the maximum usage on a single GPU.
#### GLM-4-32B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-------------|-------|---------------|---------------------|-------------------|--------------|
| BF16 | 1 | 68 GB | 0.16s | 24.4 tokens/s | 1000 |
| BF16 | 1 | 72 GB | 1.37s | 16.9 tokens/s | 8000 |
| BF16 | 2 | 50 GB | 6.75s | 8.1 tokens/s | 32000 |
| BF16 | 4 | 55 GB | 37.83s | 3.0 tokens/s | 100000 |
#### GLM-4-9B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 19 GB | 0.05s | 44.4 tokens/s | 1000 |
| BF16 | 1 | 25 GB | 0.39s | 39.0 tokens/s | 8000 |
| BF16 | 1 | 31 GB | 2.29s | 18.7 tokens/s | 32000 |
| BF16 | 1 | 55 GB | 6.80s | 14.1 tokens/s | 100000 |
#### GLM-4-9B-Chat-1M
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 75 GB | 98.4s | 2.3 tokens/s | 200000 |
#### GLM-4V-9B
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 28 GB | 0.1s | 33.4 tokens/s | 1000 |
| BF16 | 1 | 33 GB | 0.7s | 39.2 tokens/s | 8000 |
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| INT4 | 1 | 10 GB | 0.1s | 28.7 tokens/s | 1000 |
| INT4 | 1 | 15 GB | 0.8s | 24.2 tokens/s | 8000 |
## Quick Start
### Use transformers backend code
+ Use the command line to communicate with the GLM-4-9B model.
```shell
python trans_cli_demo.py # LLMs such as GLM-4-9B-0414
python trans_cli_vision_demo.py # GLM-4V-9B
```
+ Use the Gradio web client to communicate with the GLM-4-9B model.
```shell
python trans_web_demo.py # LLMs such as GLM-4-9B-0414
python trans_web_vision_demo.py # GLM-4V-9B
```
+ Use batch inference.
```shell
python trans_batch_demo.py # LLMs such as GLM-4-9B-0414
```
### Use vLLM backend code
+ Use the command line to communicate with the GLM-4-9B-Chat model.
```shell
python vllm_cli_demo.py # LLMs such as GLM-4-9B-0414
```
+ Launch an OpenAI-compatible API service.
```shell
vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
```
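Once the service is up, any OpenAI-compatible client can talk to it. A minimal sketch, assuming the default vLLM port 8000 and the served model name shown above:
```python
# Minimal sketch: query the vLLM OpenAI-compatible server started above.
# Assumes the default port 8000; the model name must match the served model.
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")
response = client.chat.completions.create(
    model="THUDM/GLM-4-9B-0414",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```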
### Use glm-4v to build an OpenAI-compatible service
Start the server:
```shell
python glm4v_server.py THUDM/glm-4v-9b
```
Client request:
```shell
python glm4v_api_request.py
```
## Stress test
Users can run this script on their own devices to test the model's generation speed with the transformers backend:
```shell
python trans_stress_test.py
```
The stress test script supports enabling **SwanLab** to track the stress testing process and record metrics:
```shell
# The API Key can be obtained by logging in to https://swanlab.cn/
python trans_stress_test.py --swanlab_api_key "Your SwanLab API Key"
```
Passing `--swanlab_api_key local` enables SwanLab's local mode.
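For example, to keep the tracking data on the local machine:
```shell
# Enable SwanLab local mode; no data is uploaded to swanlab.cn
python trans_stress_test.py --swanlab_api_key local
```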
## Run the code on Ascend NPUs
Users can run the above code on Ascend hardware. The only changes needed are importing from `openmind` instead of `transformers` and setting the device to `npu` instead of `cuda`:
```python
#from transformers import AutoModelForCausalLM, AutoTokenizer
from openmind import AutoModelForCausalLM, AutoTokenizer
#device = 'cuda'
device = 'npu'
```
# Inference
Read this in [English](README.md)
Please follow the steps in the document strictly to avoid unnecessary errors.
## Device and dependency check
### Install dependencies
```shell
pip install -r requirements.txt
```
### Related Inference Benchmark Data
**All benchmark data in this document was collected under the hardware environment listed below. Actual memory usage and runtime may vary depending on your deployment setup. Please refer to your actual environment.**
Test Hardware:
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ CUDA Version: 12.4
+ CMake: 3.23.0
+ GPU Driver: 535.104.05
+ GPU: NVIDIA H100 80GB HBM3 * 8
The following stress test results show memory usage and latency during inference. If multiple GPUs are used, "Memory Usage" refers to the maximum usage on a single GPU.
#### GLM-4-32B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 68 GB | 0.16s | 24.4 tokens/s | 1000 |
| BF16 | 1 | 72 GB | 1.37s | 16.9 tokens/s | 8000 |
| BF16 | 2 | 50 GB | 6.75s | 8.1 tokens/s | 32000 |
| BF16 | 4 | 55 GB | 37.83s | 3.0 tokens/s | 100000 |
#### GLM-4-9B-0414
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 19 GB | 0.05s | 44.4 tokens/s | 1000 |
| BF16 | 1 | 25 GB | 0.39s | 39.0 tokens/s | 8000 |
| BF16 | 1 | 31 GB | 2.29s | 18.7 tokens/s | 32000 |
| BF16 | 1 | 55 GB | 6.80s | 14.1 tokens/s | 100000 |
#### GLM-4-9B-Chat-1M
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 75 GB | 98.4s | 2.3 tokens/s | 200000 |
#### GLM-4V-9B
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| BF16 | 1 | 28 GB | 0.1s | 33.4 tokens/s | 1000 |
| BF16 | 1 | 33 GB | 0.7s | 39.2 tokens/s | 8000 |
| Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
|-----------|-------|---------------|----------------------|---------------------|---------------|
| INT4 | 1 | 10 GB | 0.1s | 28.7 tokens/s | 1000 |
| INT4 | 1 | 15 GB | 0.8s | 24.2 tokens/s | 8000 |
## Quick Start
### Use transformers backend code
+ Use the command line to communicate with the GLM-4-9B model.
```shell
python trans_cli_demo.py # LLMs such as GLM-4-9B-0414
python trans_cli_vision_demo.py # GLM-4V-9B
```
+ Use the Gradio web client to communicate with the GLM-4-9B model.
```shell
python trans_web_demo.py # LLMs such as GLM-4-9B-0414
python trans_web_vision_demo.py # GLM-4V-9B
```
+ Use batch inference.
```shell
python trans_batch_demo.py
```
### Use vLLM backend code
+ Use the command line to communicate with the GLM-4-9B-Chat model.
```shell
python vllm_cli_demo.py # LLMs such as GLM-4-9B-0414
```
+ Launch an OpenAI-compatible API service.
```shell
vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
```
### Use glm-4v to build an OpenAI-compatible service
Start the server:
```shell
python glm4v_server.py THUDM/glm-4v-9b
```
Client request:
```shell
python glm4v_api_request.py
```
## Stress test
Users can run this script on their own devices to test the model's generation speed with the transformers backend:
```shell
python trans_stress_test.py
```
The stress test script supports enabling **SwanLab** to track the stress testing process and record metrics:
```shell
# The API Key can be obtained by logging in to https://swanlab.cn/
python trans_stress_test.py --swanlab_api_key "Your SwanLab API Key"
```
Passing `--swanlab_api_key local` enables SwanLab's local mode.
## Run the code on Ascend NPUs
Users can run the above code on Ascend hardware. The only changes needed are importing from `openmind` instead of `transformers` and setting the device to `npu` instead of `cuda`:
```python
#from transformers import AutoModelForCausalLM, AutoTokenizer
from openmind import AutoModelForCausalLM, AutoTokenizer
#device = 'cuda'
device = 'npu'
```
"""
This script is an OpenAI API request demo for the glm-4v-9b model; it simply uses the OpenAI client to interact with the model.
For LLMs such as GLM-4-9B-0414, use it together with the vLLM OpenAI-compatible server:
vllm serve THUDM/GLM-4-32B-0414 --tensor_parallel_size 4
"""
import base64
from openai import OpenAI
base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
def create_chat_completion(messages, use_stream=False):
response = client.chat.completions.create(
model="glm-4v",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.4,
presence_penalty=1.2,
top_p=0.8,
)
if response:
if use_stream:
for chunk in response:
print(chunk)
else:
print(response)
else:
print("Error:", response.status_code)
def encode_image(image_path):
"""
Encodes an image file into a base64 string.
Args:
image_path (str): The path to the image file.
This function opens the specified image file, reads its content, and encodes it into a base64 string.
The base64 encoding is used to send images over HTTP as text.
"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def glm4v_simple_image_chat(use_stream=False, img_path=None):
"""
Facilitates a simple chat interaction involving an image.
Args:
use_stream (bool): Specifies whether to use streaming for chat responses.
img_path (str): Path to the image file to be included in the chat.
This function encodes the specified image and constructs a predefined conversation involving the image.
It then calls `create_chat_completion` to generate a response from the model.
The conversation includes asking about the content of the image and a follow-up question.
"""
img_url = f"data:image/jpeg;base64,{encode_image(img_path)}"
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What’s in this image?",
},
{
"type": "image_url",
"image_url": {"url": img_url},
},
],
},
{
"role": "assistant",
"content": "The image displays a wooden boardwalk extending through a vibrant green grassy wetland. The sky is partly cloudy with soft, wispy clouds, indicating nice weather. Vegetation is seen on either side of the boardwalk, and trees are present in the background, suggesting that this area might be a natural reserve or park designed for ecological preservation and outdoor recreation. The boardwalk allows visitors to explore the area without disturbing the natural habitat.",
},
{"role": "user", "content": "Do you think this is a spring or winter photo?"},
]
create_chat_completion(messages=messages, use_stream=use_stream)
if __name__ == "__main__":
glm4v_simple_image_chat(use_stream=False, img_path="demo.jpg")
import base64
import gc
import sys
import threading
import time
from contextlib import asynccontextmanager
from io import BytesIO
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import requests
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from peft import PeftModelForCausalLM
from PIL import Image
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer
TORCH_TYPE = (
torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
An asynchronous context manager for managing the lifecycle of the FastAPI app.
It ensures that GPU memory is cleared after the app's lifecycle ends, which is essential for efficient resource management in GPU environments.
"""
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelCard(BaseModel):
"""
A Pydantic model representing a model card, which provides metadata about a machine learning model.
It includes fields like model ID, owner, and creation time.
"""
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class ImageUrl(BaseModel):
url: str
class TextContent(BaseModel):
type: Literal["text"]
text: str
class ImageUrlContent(BaseModel):
type: Literal["image_url"]
image_url: ImageUrl
ContentItem = Union[TextContent, ImageUrlContent]
class ChatMessageInput(BaseModel):
role: Literal["user", "assistant", "system"]
content: Union[str, List[ContentItem]]
name: Optional[str] = None
class ChatMessageResponse(BaseModel):
role: Literal["assistant"]
content: Optional[str] = None
name: Optional[str] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessageInput]
temperature: Optional[float] = 0.8
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
# Additional parameters
repetition_penalty: Optional[float] = 1.0
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessageResponse
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
model: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageInfo] = None
@app.get("/v1/models", response_model=ModelList)
async def list_models():
"""
An endpoint to list available models. It returns a list of model cards.
This is useful for clients to query and understand what models are available for use.
"""
model_card = ModelCard(id="GLM-4v-9b")
return ModelList(data=[model_card])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer
if len(request.messages) < 1 or request.messages[-1].role == "assistant":
raise HTTPException(status_code=400, detail="Invalid request")
gen_params = dict(
messages=request.messages,
temperature=request.temperature,
top_p=request.top_p,
max_tokens=request.max_tokens or 1024,
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
)
if request.stream:
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
response = generate_glm4v(model, tokenizer, gen_params)
usage = UsageInfo()
message = ChatMessageResponse(
role="assistant",
content=response["text"],
)
choice_data = ChatCompletionResponseChoice(
index=0,
message=message,
)
task_usage = UsageInfo.model_validate(response["usage"])
for usage_key, usage_value in task_usage.model_dump().items():
setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
def predict(model_id: str, params: dict):
global model, tokenizer
choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage(role="assistant"), finish_reason=None)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
for new_response in generate_stream_glm4v(model, tokenizer, params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(previous_text) :]
previous_text = decoded_unicode
delta = DeltaMessage(content=delta_text, role="assistant")
choice_data = ChatCompletionResponseStreamChoice(index=0, delta=delta)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(index=0, delta=DeltaMessage())
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
def generate_glm4v(model: AutoModel, tokenizer: AutoTokenizer, params: dict):
"""
Generates a response using the GLM-4v-9b model. It processes the chat history and image data, if any,
and then invokes the model to generate a response.
"""
response = None
for response in generate_stream_glm4v(model, tokenizer, params):
pass
return response
def process_history_and_images(
messages: List[ChatMessageInput],
) -> Tuple[Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]:
"""
Process history messages to extract text, identify the last user query,
and convert base64 encoded image URLs to PIL images.
Args:
messages(List[ChatMessageInput]): List of ChatMessageInput objects.
return: A tuple of three elements:
- The last user query as a string.
- Text history formatted as a list of tuples for the model.
- List of PIL Image objects extracted from the messages.
"""
formatted_history = []
image_list = []
last_user_query = ""
for i, message in enumerate(messages):
role = message.role
content = message.content
if isinstance(content, list): # text
text_content = " ".join(item.text for item in content if isinstance(item, TextContent))
else:
text_content = content
if isinstance(content, list): # image
for item in content:
if isinstance(item, ImageUrlContent):
image_url = item.image_url.url
if image_url.startswith("data:image/jpeg;base64,"):
base64_encoded_image = image_url.split("data:image/jpeg;base64,")[1]
image_data = base64.b64decode(base64_encoded_image)
image = Image.open(BytesIO(image_data)).convert("RGB")
else:
response = requests.get(image_url, verify=False)
image = Image.open(BytesIO(response.content)).convert("RGB")
image_list.append(image)
if role == "user":
if i == len(messages) - 1: # last user message
last_user_query = text_content
else:
formatted_history.append((text_content, ""))
elif role == "assistant":
if formatted_history:
if formatted_history[-1][1] != "":
assert False, f"the last query is answered. answer again. {formatted_history[-1][0]}, {formatted_history[-1][1]}, {text_content}"
formatted_history[-1] = (formatted_history[-1][0], text_content)
else:
assert False, "assistant reply before user"
else:
assert False, f"unrecognized role: {role}"
return last_user_query, formatted_history, image_list
@torch.inference_mode()
def generate_stream_glm4v(model: AutoModel, tokenizer: AutoTokenizer, params: dict):
uploaded = False
messages = params["messages"]
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_tokens", 256))
query, history, image_list = process_history_and_images(messages)
inputs = []
for idx, (user_msg, model_msg) in enumerate(history):
if idx == len(history) - 1 and not model_msg:
inputs.append({"role": "user", "content": user_msg})
if image_list and not uploaded:
inputs[-1].update({"image": image_list[0]})
uploaded = True
break
if user_msg:
inputs.append({"role": "user", "content": user_msg})
if model_msg:
inputs.append({"role": "assistant", "content": model_msg})
if len(image_list) >= 1:
inputs.append({"role": "user", "content": query, "image": image_list[0]})
else:
inputs.append({"role": "user", "content": query})
model_inputs = tokenizer.apply_chat_template(
inputs, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
).to(next(model.parameters()).device)
input_echo_len = len(model_inputs["input_ids"][0])
streamer = TextIteratorStreamer(tokenizer=tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
gen_kwargs = {
"repetition_penalty": repetition_penalty,
"max_new_tokens": max_new_tokens,
"do_sample": True if temperature > 1e-5 else False,
"top_p": top_p if temperature > 1e-5 else 0,
"top_k": 1,
"streamer": streamer,
"eos_token_id": [151329, 151336, 151338],
}
if temperature > 1e-5:
gen_kwargs["temperature"] = temperature
generated_text = ""
def generate_text():
with torch.no_grad():
model.generate(**model_inputs, **gen_kwargs)
generation_thread = threading.Thread(target=generate_text)
generation_thread.start()
total_len = input_echo_len
for next_text in streamer:
generated_text += next_text
total_len = len(tokenizer.encode(generated_text))
yield {
"text": generated_text,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
}
generation_thread.join()
print("\033[91m--generated_text\033[0m", generated_text)
yield {
"text": generated_text,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": total_len - input_echo_len,
"total_tokens": total_len,
},
}
gc.collect()
torch.cuda.empty_cache()
if __name__ == "__main__":
MODEL_PATH = sys.argv[1]
model_dir = Path(MODEL_PATH).expanduser().resolve()
if (model_dir / "adapter_config.json").exists():
import json
with open(model_dir / "adapter_config.json", "r", encoding="utf-8") as file:
config = json.load(file)
model = AutoModel.from_pretrained(
config.get("base_model_name_or_path"), device_map="auto", torch_dtype=TORCH_TYPE
)
model = PeftModelForCausalLM.from_pretrained(
model=model,
model_id=model_dir,
)
tokenizer = AutoTokenizer.from_pretrained(config.get("base_model_name_or_path"), encode_special_tokens=True)
model.eval()
else:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, encode_special_tokens=True)
model = AutoModel.from_pretrained(
MODEL_PATH,
torch_dtype=TORCH_TYPE,
device_map="auto",
).eval()
uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
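For reference, the `/v1/chat/completions` endpoint defined above can also be called directly over HTTP. A minimal sketch, assuming the server is running locally on port 8000 as configured and using an illustrative image URL:
```python
# Sketch: call the glm4v_server.py endpoint directly (assumes localhost:8000).
# The image URL is illustrative; data:image/jpeg;base64 URLs are also accepted.
import requests

payload = {
    "model": "glm-4v",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpg"}},
            ],
        }
    ],
    "max_tokens": 256,
}
resp = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```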
transformers>=4.51.3
sentencepiece>=0.2.0
jinja2>=3.1.4
pydantic>=2.11.1
timm>=1.0.15
tiktoken>=0.9.0
numpy<2
accelerate>=1.6.0
sentence_transformers>=3.1.1
gradio>=5.23.3
openai>=1.70.0
einops>=0.8.0
pillow>=10.4.0
sse-starlette>=2.1.3
bitsandbytes>=0.44.1 # INT4 Loading, Not support for NPU
peft>=0.15.0 # Using with finetune model
swanlab>=0.5.5
# git+https://github.com/vllm-project/vllm.git For vLLM
"""
Here is an example of making batch requests to the GLM-4-0414 models and the glm-4-9b-chat-hf model with the transformers library.
You need to build the conversation format yourself and then call the batch function to make batch requests.
Please note that in this demo, the memory consumption is significantly higher.
"""
from typing import Union
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
MODEL_PATH = "THUDM/GLM-4-9B-0414"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto").eval()
def process_model_outputs(inputs, outputs, tokenizer):
responses = []
for input_ids, output_ids in zip(inputs.input_ids, outputs):
response = tokenizer.decode(output_ids[len(input_ids) :], skip_special_tokens=True).strip()
responses.append(response)
return responses
def batch(
model,
tokenizer,
messages: Union[str, list[str]],
max_input_tokens: int = 8192,
max_new_tokens: int = 8192,
num_beams: int = 1,
do_sample: bool = True,
top_p: float = 0.8,
temperature: float = 0.8,
logits_processor=None,
):
if logits_processor is None:
logits_processor = LogitsProcessorList()
messages = [messages] if isinstance(messages, str) else messages
batched_inputs = tokenizer(
messages, return_tensors="pt", padding="max_length", truncation=True, max_length=max_input_tokens
).to(model.device)
gen_kwargs = {
"max_new_tokens": max_new_tokens,
"num_beams": num_beams,
"do_sample": do_sample,
"top_p": top_p,
"temperature": temperature,
"logits_processor": logits_processor,
"eos_token_id": model.config.eos_token_id,
}
batched_outputs = model.generate(**batched_inputs, **gen_kwargs)
batched_response = process_model_outputs(batched_inputs, batched_outputs, tokenizer)
return batched_response
if __name__ == "__main__":
batch_message = [
[
{"role": "user", "content": "我的爸爸和妈妈结婚为什么不能带我去"},
{"role": "assistant", "content": "因为他们结婚时你还没有出生"},
{"role": "user", "content": "我刚才的提问是"},
],
[{"role": "user", "content": "你好,你是谁"}],
]
batch_inputs = []
max_input_tokens = 128
for i, messages in enumerate(batch_message):
new_batch_input = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)[12:]
max_input_tokens = max(max_input_tokens, len(new_batch_input))
batch_inputs.append(new_batch_input)
gen_kwargs = {
"max_input_tokens": max_input_tokens,
"max_new_tokens": 256,
"do_sample": True,
"top_p": 0.8,
"temperature": 0.8,
"num_beams": 1,
}
batch_responses = batch(model, tokenizer, batch_inputs, **gen_kwargs)
for response in batch_responses:
print("=" * 10)
print(response)