# Byte-compiled / optimized / DLL files
.idea
__pycache__/
*.py[codz]
*$py.class
.DS_Store
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.7
    hooks:
      - id: ruff
        args: [--output-format, github, --fix, --select, I]
      - id: ruff-format
  - repo: https://github.com/crate-ci/typos
    rev: v1.32.0
    hooks:
      - id: typos
  - repo: https://github.com/jackdewinter/pymarkdown
    rev: v0.9.29
    hooks:
      - id: pymarkdown
        args: [fix]
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2025 Zhipu AI
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# GLM-4.5
## Paper
`GLM-4.5: Reasoning, Coding, and Agentic Abilities`
- https://z.ai/blog/glm-4.5
## Model Architecture
The GLM-4.5 series adopts an MoE (Mixture-of-Experts) architecture, which significantly improves compute efficiency for both training and inference. The MoE layers use loss-free balance routing and a sigmoid gating mechanism, as sketched below the figure.
<div align=center>
<img src="./asserts/model.png"/>
</div>
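To make the routing idea concrete, here is a minimal, illustrative PyTorch sketch of a sigmoid-gated top-k router with a loss-free balancing bias (the bias only influences expert selection, not the gate weights). This is a generic sketch of the technique, not the actual GLM-4.5 implementation; the class name, bias-update rule, and hyperparameters are assumptions.

```python
import torch
import torch.nn as nn


class SigmoidTopKRouter(nn.Module):
    """Illustrative sigmoid-gated top-k router with a loss-free balancing bias."""

    def __init__(self, hidden_size: int, num_experts: int, top_k: int, bias_lr: float = 1e-3):
        super().__init__()
        self.gate = nn.Linear(hidden_size, num_experts, bias=False)
        # Per-expert bias used only for expert *selection* (loss-free balancing).
        self.register_buffer("balance_bias", torch.zeros(num_experts))
        self.top_k = top_k
        self.bias_lr = bias_lr

    def forward(self, x: torch.Tensor):
        # x: [num_tokens, hidden_size]
        scores = torch.sigmoid(self.gate(x))                      # sigmoid gate, [tokens, experts]
        _, expert_idx = torch.topk(scores + self.balance_bias, self.top_k, dim=-1)
        weights = torch.gather(scores, -1, expert_idx)            # gate values of the chosen experts
        weights = weights / weights.sum(dim=-1, keepdim=True)     # normalize per token

        # Loss-free balancing: nudge the selection bias toward under-loaded experts
        # instead of adding an auxiliary load-balancing loss (in a real model this
        # update would only run during training).
        load = torch.zeros_like(self.balance_bias).scatter_add_(
            0, expert_idx.reshape(-1), torch.ones(expert_idx.numel(), device=x.device)
        )
        self.balance_bias += self.bias_lr * torch.sign(load.mean() - load)
        return expert_idx, weights


router = SigmoidTopKRouter(hidden_size=64, num_experts=8, top_k=2)
idx, w = router(torch.randn(16, 64))
print(idx.shape, w.shape)  # torch.Size([16, 2]) torch.Size([16, 2])
```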
## Algorithm Overview
Our base model goes through several training stages. During pre-training, the model is first trained on a 15T-token general corpus and then on a 7T-token code and reasoning corpus. After pre-training, a mid-training stage is introduced to further improve performance on targeted domains.
<div align=center>
<img src="./asserts/image.png"/>
</div>
## Environment Setup
### Hardware Requirements
DCU model: BW1000; number of nodes: 2; number of cards: 2 × 8.
### Communication Setup
Part 1: Basic inter-node communication
`Configure the following on the host machine:`
1. Disable the firewall:
```
systemctl stop firewalld # on CentOS
ufw disable              # on Ubuntu
```
2. Set `amd_iommu=on`:
```
vim /etc/default/grub
```
<div align=center>
<img src="./figures/amd_iommu.png"/>
</div>
Update the GRUB configuration:
```
grub2-mkconfig -o /boot/efi/EFI/rocky/grub.cfg
```
After rebooting, verify that the setting took effect (check that `iommu=pt` appears in the kernel command line, as below):
```
BOOT_IMAGE=(hd0,gpt3)/vmlinuz-4.18.0-372.9.1.el8.x86_64 root=UUID=80974f58-7d23-49bb-bd8b-8e299eb0d188 ro crashkernel=auto rhgb quiet systemd.unified_cgroup_hierachy=1 systemd.unified_cgroup_hierarchy=1 amd_iommu=on iommu=pt
```
`Configure the following inside the container started in a later step:`
```
apt update
apt install openssh-server -y
```
Edit `/etc/ssh/sshd_config` and set `PermitRootLogin` to `yes`:
```
# Uncomment the following 4 lines
RSAAuthentication yes                        # enable RSA authentication
PubkeyAuthentication yes                     # enable public/private key authentication
AuthorizedKeysFile ~/.ssh/authorized_keys    # public key file path (same file as generated below)
PermitRootLogin yes                          # allow root to log in via ssh
```
Restart the ssh service and enable it at boot:
```
service sshd restart
chkconfig sshd on
# Check sshd status:      service ssh status
# Start the sshd service: /etc/init.d/ssh restart
```
Next, set up the keys for passwordless communication between nodes:
1. Generate a key pair with ssh-keygen
```
ssh-keygen -t ed25519 # ed25519 is used here as an example; you may choose another key type/name. Press Enter at every prompt to accept the defaults.
```
2. Collect each node's public key (e.g. the `~/.ssh/id_ed25519.pub` generated above) and append it to `~/.ssh/authorized_keys` on every node, so that the `authorized_keys` file is eventually identical across all nodes. The format looks like this:
<div align=center>
<img src="./figures/id_rsa.png"/>
</div>
3. Set the communication port between nodes
```
/usr/sbin/sshd -p 10085 # Different nodes may use different port numbers. Once keys and ports are set up, verify inter-node connectivity with ssh -p; if it fails, re-check the previous steps.
```
These steps are not standardized; servers and clusters differ significantly, so the process cannot be copied verbatim. Adapt it to your own machines. The overall goal is to enable `amd_iommu` and passwordless SSH between the containers on different nodes.
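As a quick way to confirm that this goal has been reached, the sketch below checks passwordless SSH from the current container to every other node. The node IPs and the port are placeholders; substitute the values you chose above.

```python
import subprocess

# Placeholder node IPs and the sshd port chosen above; replace with your own values.
NODES = ["x.x.x.1", "x.x.x.2"]
PORT = "10085"

for node in NODES:
    # BatchMode=yes makes ssh fail instead of prompting for a password,
    # so a non-zero return code means passwordless login is not yet working.
    result = subprocess.run(
        ["ssh", "-p", PORT, "-o", "BatchMode=yes", "-o", "ConnectTimeout=5", node, "hostname"],
        capture_output=True, text=True,
    )
    status = "OK" if result.returncode == 0 else f"FAILED ({result.stderr.strip()})"
    print(f"{node}: {status}")
```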
Part 2: Ray-related communication
`Configure the following inside the container started in a later step:`
```
vim ~/.bashrc
```
Append the following commands to the end of `.bashrc` (a BW large-scale cluster is used as the example):
```
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_HOST_IP=x.x.x.x
export NCCL_SOCKET_IFNAME=ib0
export GLOO_SOCKET_IFNAME=ib0
unset NCCL_ALGO
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_NET_GDR_READ=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export LMSLIM_USE_LIGHTOP=0
# Recommended NUMA core binding for hycpu environments:
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# Additional environment variables required on BW clusters:
export NCCL_NET_GDR_LEVEL=7
export NCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="topo-input.xml"
# For K100_AI cards, add the following (commented out here because BW cards are used as the example):
# export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44
```
`VLLM_HOST_IP` and `NCCL_SOCKET_IFNAME` must be replaced with the values found on each of your own machines; every node has a different IP. They can be queried as follows:
```
Query the interface and IP with: ifconfig
VLLM_HOST_IP: the node's local communication IP
NCCL_SOCKET_IFNAME and GLOO_SOCKET_IFNAME: the node's local network interface name
```
`Example:`
<div align=center>
<img src="./figures/ip.png"/>
</div>
On clusters with BW cards, VLLM_HOST_IP must be set to the IP of the IB NIC to avoid RCCL timeouts:
<div align=center>
<img src="./figures/ip_bw.png"/>
</div>
Note: after adding the variables above, activate them:
```
source ~/.bashrc
```
`Tip: communication configuration is normally the domain of operations staff; we recommend asking your ops team to apply the settings above.`
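Before starting any service, a quick sanity check that the variables above are actually active in the current container can save debugging time. A minimal sketch (the variable list is taken from the `.bashrc` snippet above; trim or extend it for your card type):

```python
import os

# Variables from the .bashrc snippet above; adjust for your own cluster.
required = [
    "VLLM_HOST_IP",
    "NCCL_SOCKET_IFNAME",
    "GLOO_SOCKET_IFNAME",
    "HIP_VISIBLE_DEVICES",
    "NCCL_IB_HCA",
    "NCCL_TOPO_FILE",
]

missing = [name for name in required if not os.environ.get(name)]
if missing:
    print("Missing environment variables:", ", ".join(missing))
else:
    print("All required variables are set, VLLM_HOST_IP =", os.environ["VLLM_HOST_IP"])
```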
### Docker (Option 1)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250802-step3
docker run -it --name glm4.5_vllm --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v $PWD/Step3_pytorch:/home/Step3_pytorch f0e4191089de /bin/bash
wget --content-disposition 'https://download.sourcefind.cn:65024/file/4/triton/DAS1.6/triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl'
pip install triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
### Dockerfile (Option 2)
```
cd $PWD/Step3_pytorch/docker
docker build --no-cache -t glm4.5:latest .
docker run -it --name glm4.5_vllm --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v $PWD/Step3_pytorch:/home/Step3_pytorch f0e4191089de /bin/bash
wget --content-disposition 'https://download.sourcefind.cn:65024/file/4/triton/DAS1.6/triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl'
pip install triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
## Dataset
`None`
## Training
`None`
## Inference
Pretrained weight directory layout; replace the model paths in later commands with your actual weight path:
```
/home/glm4.5_pytorch/
└── zai-org/GLM-4.5
```
### Multi-node, multi-card
Start the Ray cluster:
```
# Start Ray
# Start Ray on the head node. x.x.x.x is the head node IP found with ifconfig earlier (VLLM_HOST_IP); --port can be any value, as long as it is the same on the head and worker nodes.
ray start --head --node-ip-address=x.x.x.x --port=6379 --num-gpus=8 --num-cpus=32
# Start Ray on the other nodes. Note: x.x.x.x is still the head node IP found with ifconfig earlier (VLLM_HOST_IP).
ray start --address='x.x.x.x:6379' --num-gpus=8 --num-cpus=32
# Use `ray status` to check the cluster state.
```
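Besides `ray status`, you can also confirm from Python that both nodes joined and all 16 GPUs are registered. A small sketch (run it on the head node; it only reads cluster state):

```python
import ray

# Attach to the running cluster started by `ray start` above.
ray.init(address="auto")

alive = [n for n in ray.nodes() if n["Alive"]]
total_gpus = sum(n["Resources"].get("GPU", 0) for n in alive)
print(f"alive nodes: {len(alive)}, total GPUs: {total_gpus}")  # expect 2 nodes / 16 GPUs here

ray.shutdown()
```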
vLLM deployment (upstream vLLM does not yet support AFD; only non-disaggregated deployment is supported):
```
# Run on the head node
VLLM_USE_NN=0 VLLM_USE_FLASH_ATTN_PA=0 vllm serve /path/to/GLM4.5 \
--reasoning-parser glm4_moe \
--enable-auto-tool-choice \
--tool-call-parser glm4_moe \
--trust-remote-code \
--max-num-batched-tokens 4096 \
--distributed-executor-backend ray \
--dtype float16 \
-tp 16 \
--port $PORT_SERVING
```
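Before sending real requests, you can confirm that the service on the head node is reachable. A small sketch using the same `openai` client as `inference/api_request.py` (the host and port are placeholders for your own deployment):

```python
from openai import OpenAI

# Placeholder address: the head node IP and the $PORT_SERVING value used above.
client = OpenAI(api_key="EMPTY", base_url="http://x.x.x.x:8000/v1")

# The OpenAI-compatible server exposes its served model list; if this call
# succeeds, the 16-card deployment is up and ready for chat requests.
for model in client.models.list().data:
    print(model.id)
```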
- Client request
```bash
python inference/api_request.py
```
For more information, see [`README_zh`](./README_zh.md) from the upstream project.
## Results
Example 1:
- Input text: 请帮我查询一下北京的天气。 ("Please check the weather in Beijing for me.")
<div align=center>
<img src="./asserts/example.png"/>
</div>
- Output:
<div align=center>
<img src="./asserts/results.png"/>
</div>
### Accuracy
Accuracy on DCU matches that on GPU; inference framework: vLLM.
## Application Scenarios
### Algorithm Category
`Dialogue / question answering`
### Key Application Industries
`E-commerce, education, broadcast media`
## Pretrained Weights
Hugging Face weight download:
- [zai-org/GLM-4.5](https://huggingface.co/zai-org/GLM-4.5)
`Note: a mirror is recommended for downloading: export HF_ENDPOINT=https://hf-mirror.com`
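For scripted downloads, the same mirror can be used with `huggingface_hub`. A sketch (the local directory is a placeholder chosen to match the weight layout shown earlier):

```python
import os

# Use the mirror endpoint recommended above; it must be set before importing huggingface_hub.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="zai-org/GLM-4.5",
    local_dir="/home/glm4.5_pytorch/zai-org/GLM-4.5",  # placeholder target directory
)
```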
## Source Repository and Issue Feedback
- https://developer.sourcefind.cn/codes/modelzoo/glm4.5_pytorch
## References
- https://github.com/zai-org/GLM-4.5/tree/main
# GLM-4.5
[English Version](./README.md)
<div align="center">
<img src=resources/logo.svg width="15%"/>
</div>
<p align="center">
👋 Join our <a href="resources/WECHAT.md" target="_blank"> WeChat group </a> and <a href="https://discord.gg/QR7SARHRxK" target="_blank"> Discord </a> community.
<br>
📖 Check out the GLM-4.5 <a href="https://z.ai/blog/glm-4.5" target="_blank"> technical blog </a> and the <a href="https://zhipu-ai.feishu.cn/wiki/Gv3swM0Yci7w7Zke9E0crhU7n7D" target="_blank"> Zhipu AI technical documentation </a>.
<br>
📍 Use the GLM-4.5 API service on the <a href="https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5"> Zhipu AI Open Platform </a>.
<br>
👉 Try <a href="https://chat.z.ai" >GLM-4.5 </a> with one click.
</p>
## Model Introduction
The **GLM-4.5** series are foundation models designed for agent applications. GLM-4.5 has **355** billion total parameters with **32** billion active parameters, while GLM-4.5-Air adopts a more compact design with **106** billion total parameters and **12** billion active parameters. GLM-4.5 models unify reasoning, coding, and agentic capabilities to meet the complex demands of agent applications.
Both GLM-4.5 and GLM-4.5-Air are hybrid reasoning models offering two modes: a thinking mode for complex reasoning and tool use, and a non-thinking mode for immediate responses.
We have open-sourced the base models, the hybrid reasoning models, and FP8 versions of the hybrid reasoning models for both GLM-4.5 and GLM-4.5-Air. They are released under the MIT license and can be used commercially and for secondary development.
In our comprehensive evaluation across 12 industry-standard benchmarks, GLM-4.5 performs strongly with a score of **63.2**, ranking **3rd** among all proprietary and open-source models. Notably, GLM-4.5-Air achieves a competitive **59.8** while maintaining excellent efficiency.
![bench](resources/bench.png)
For more evaluation results, demo cases, and technical details, please visit our [technical blog](https://z.ai/blog/glm-4.5). The technical report will be released soon.
The model code, tool parser, and reasoning parser implementations can be found in [transformers](https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4_moe), [vLLM](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/glm4_moe_mtp.py), and
[SGLang](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/glm4_moe.py).
## Model Downloads
You can try the model directly on [Hugging Face](https://huggingface.co/spaces/zai-org/GLM-4.5-Space) or
[ModelScope](https://modelscope.cn/studios/ZhipuAI/GLM-4.5-Demo), or download it from the links below.
| Model | Download Link | Model Size | Precision |
|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-----------|------|
| GLM-4.5 | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5) | 355B-A32B | BF16 |
| GLM-4.5-Air | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Air)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air) | 106B-A12B | BF16 |
| GLM-4.5-FP8 | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-FP8)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-FP8) | 355B-A32B | FP8 |
| GLM-4.5-Air-FP8 | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Air-FP8)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air-FP8) | 106B-A12B | FP8 |
| GLM-4.5-Base | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Base)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Base) | 355B-A32B | BF16 |
| GLM-4.5-Air-Base | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Air-Base)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air-Base) | 106B-A12B | BF16 |
## System Requirements
### Inference
We provide minimum and recommended configurations for "full-featured" model inference. The data in the tables below are based on the following conditions:
1. All models use the MTP layer with `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4` specified to ensure competitive inference speed.
2. The `cpu-offload` parameter is not used.
3. The inference batch size does not exceed `8`.
4. All operations are performed on devices with native FP8 inference support, so both weights and cache are in FP8 format.
5. Server memory must exceed `1T` to ensure normal model loading and operation.
The models can run under the configurations in the following table:
| Model | Precision | GPU Type and Count | Test Framework |
|-------------|------|----------------------|--------|
| GLM-4.5 | BF16 | H100 x 16 / H200 x 8 | sglang |
| GLM-4.5 | FP8 | H100 x 8 / H200 x 4 | sglang |
| GLM-4.5-Air | BF16 | H100 x 4 / H200 x 2 | sglang |
| GLM-4.5-Air | FP8 | H100 x 2 / H200 x 1 | sglang |
Under the configurations in the following table, the models can make full use of their 128K context length:
| Model | Precision | GPU Type and Count | Test Framework |
|-------------|------|-----------------------|--------|
| GLM-4.5 | BF16 | H100 x 32 / H200 x 16 | sglang |
| GLM-4.5 | FP8 | H100 x 16 / H200 x 8 | sglang |
| GLM-4.5-Air | BF16 | H100 x 8 / H200 x 4 | sglang |
| GLM-4.5-Air | FP8 | H100 x 4 / H200 x 2 | sglang |
### Fine-tuning
With the [Llama Factory](https://github.com/hiyouga/LLaMA-Factory) framework, the code can run under the following configurations:
| Model | GPU Type and Count | Strategy | Batch Size (per GPU) |
|-------------|-----------|------|-------------|
| GLM-4.5 | H100 x 16 | Lora | 1 |
| GLM-4.5-Air | H100 x 4 | Lora | 1 |
With the [Swift](https://github.com/modelscope/ms-swift) framework, the code can run under the following configurations:
| Model | GPU Type and Count | Strategy | Batch Size (per GPU) |
|-------------|-------------------|------|-------------|
| GLM-4.5 | H20 (96GiB) x 16 | Lora | 1 |
| GLM-4.5-Air | H20 (96GiB) x 4 | Lora | 1 |
| GLM-4.5 | H20 (96GiB) x 128 | SFT | 1 |
| GLM-4.5-Air | H20 (96GiB) x 32 | SFT | 1 |
| GLM-4.5 | H20 (96GiB) x 128 | RL | 1 |
| GLM-4.5-Air | H20 (96GiB) x 32 | RL | 1 |
## Quick Start
Install the required packages listed in `requirements.txt`:
```shell
pip install -r requirements.txt
```
### transformers
See `trans_infer_cli.py` in the `inference` folder; its core flow, trimmed down, is shown below.
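```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "zai-org/GLM-4.5"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build the prompt with the model's chat template, then generate.
messages = [{"role": "user", "content": "你好"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(model.device)
generated = model.generate(**inputs, max_new_tokens=128, do_sample=False)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[1]:]))
```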
### vLLM
+ Both BF16 and FP8 can be launched with the following command:
```shell
vllm serve zai-org/GLM-4.5-Air \
--tensor-parallel-size 8 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--enable-auto-tool-choice \
--served-model-name glm-4.5-air
```
If you are using 8x H100 GPUs and run out of memory when serving the GLM-4.5 model, you will need `--cpu-offload-gb 16` (vLLM only).
If you run into `flash infer` issues, use `VLLM_ATTENTION_BACKEND=XFORMERS` as a temporary workaround. You can also specify
`TORCH_CUDA_ARCH_LIST='9.0+PTX'` to use `flash infer` (different GPUs require different TORCH_CUDA_ARCH_LIST values; check accordingly).
### SGLang
+ BF16
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5-Air \
--tp-size 8 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--served-model-name glm-4.5-air \
--host 0.0.0.0 \
--port 8000
```
+ FP8
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5-Air-FP8 \
--tp-size 4 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--disable-shared-experts-fusion \
--served-model-name glm-4.5-air-fp8 \
--host 0.0.0.0 \
--port 8000
```
### Request Parameter Notes
+ With `vLLM` and `SGLang`, thinking mode is enabled by default when sending requests. To disable it, add the
`extra_body={"chat_template_kwargs": {"enable_thinking": False}}` parameter.
+ Both support tool calling. Use the OpenAI-style tool description format, as shown in the sketch below.
+ For concrete code, see `api_request.py` in the `inference` folder.
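A minimal sketch covering both points, modeled on `inference/api_request.py` (the endpoint is a placeholder and the tool schema is just an example):

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")  # placeholder endpoint

# OpenAI-style tool description
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current temperature for a given location.",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

completion = client.chat.completions.create(
    model="glm-4.5-air",  # the --served-model-name used when launching the server
    messages=[{"role": "user", "content": "What is the weather in Beijing?"}],
    tools=tools,
    # Thinking is on by default; this extra_body switches it off.
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print(completion.choices[0].message.tool_calls)
```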
# Setting up Claude Code Service with SGLang + GLM-4.5 Model
[中文阅读](./README_zh.md)
## Installation
You need a local computer for programming and a server to run the `GLM-4.5` model.
### Local Device
Ensure you have installed [Claude Code](https://github.com/anthropics/claude-code)
and [Claude Code Router](https://github.com/musistudio/claude-code-router).
```
npm install -g @anthropic-ai/claude-code
npm install -g @musistudio/claude-code-router
```
### Server
Ensure you have installed `sglang` on your server.
```shell
pip install sglang
```
And start the model service with the following command:
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5 \
--tp-size 16 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--served-model-name glm-4.5 \
--port 8000 \
--host 0.0.0.0 # Or your server's internal/public IP address
```
When successful, you will see output similar to the following:
```
[2025-07-26 16:09:07] INFO: Started server process [80269]
[2025-07-26 16:09:07] INFO: Waiting for application startup.
[2025-07-26 16:09:07] INFO: Application startup complete.
[2025-07-26 16:09:07] INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
[2025-07-26 16:09:08] INFO: 127.0.0.1:57722 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-07-26 16:09:11] INFO: 127.0.0.1:57732 - "POST /generate HTTP/1.1" 200 OK
[2025-07-26 16:09:11] The server is fired up and ready to roll!
```
Please ensure your server's IP can be accessed from the device where Claude Code and Claude Code Router are installed.
## Configuration
1. Modify the `api_base_url` in `config.example.json` to your server's accessible address.
2. Rename it to `config.json`
3. Copy it to `~/.claude-code-router/config.json`.
4. Execute the `ccr restart` command in the command line. You should see the following output:
```
Service was not running or failed to stop.
Starting claude code router service...
✅ Service started successfully in the background.
```
## Running
Execute the following command:
```
ccr code
```
It should look like this:
```
zr@MacBook GLM-4.5 % ccr code
Service not running, starting service...
╭───────────────────────────────────────────────────╮
Welcome to Claude Code! │
/help for help, /status for your current setup
cwd: /Users/zr/Code/GLM-4.5
───────────────────────────────────────────────
Overrides (via env):
API timeout: 600000ms
API Base URL: http://127.0.0.1:3456
╰───────────────────────────────────────────────────╯
Tip: Press Esc twice to edit your previous messages
> how can I run GLM-4.5 in transformers
I'll help you understand how to run GLM-4.5 in transformers. Let me first explore the codebase to see what's available.
List(.)
 Listed 21 paths (ctrl+r to expand)
Read(README.md)
 Read 4 lines (ctrl+r to expand)
Read(inference/trans_infer_cli.py)
 Read 29 lines (ctrl+r to expand)
Read(requirements.txt)
 Read 5 lines (ctrl+r to expand)
You can run GLM-4.5 in transformers using the example script in inference/trans_infer_cli.py. Here's the basic approach:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
MODEL_PATH = "zai-org/GLM-4.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=torch.bfloat16,
device_map="auto"
)
Then use the tokenizer's chat template for inference.
╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ > 
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
? for shortcuts
```
# Setting up a Claude Code Service with SGLang + GLM-4.5
[Read this in English.](./README.md)
## Installation
You need a local computer for programming and a server to run the `GLM-4.5` model.
### Local Device
Make sure you have installed [Claude Code](https://github.com/anthropics/claude-code)
and [Claude Code Router](https://github.com/musistudio/claude-code-router).
```
npm install -g @anthropic-ai/claude-code
npm install -g @musistudio/claude-code-router
```
### Server
Make sure `sglang` is installed on your server:
```shell
pip install sglang
```
Then start the model service with the following command:
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5 \
--tp-size 16 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--served-model-name glm-4.5 \
--port 8000 \
--host 0.0.0.0 # or your server's internal/public IP address
```
When it starts successfully, you will see output similar to the following:
```
[2025-07-26 16:09:07] INFO: Started server process [80269]
[2025-07-26 16:09:07] INFO: Waiting for application startup.
[2025-07-26 16:09:07] INFO: Application startup complete.
[2025-07-26 16:09:07] INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
[2025-07-26 16:09:08] INFO: 127.0.0.1:57722 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-07-26 16:09:08 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
[2025-07-26 16:09:11] INFO: 127.0.0.1:57732 - "POST /generate HTTP/1.1" 200 OK
[2025-07-26 16:09:11] The server is fired up and ready to roll!
```
Make sure your server's IP is reachable from the device where Claude Code and Claude Code Router are installed.
## Configuration
1. Change `api_base_url` in `config.example.json` to your server's accessible address.
2. Rename it to `config.json`.
3. Copy it to `~/.claude-code-router/config.json`.
4. Run `ccr restart` on the command line. You should see the following output:
```
Service was not running or failed to stop.
Starting claude code router service...
✅ Service started successfully in the background.
```
## Running
Run the following command:
```
ccr code
```
It should then run normally, as shown below:
```
zr@MacBook GLM-4.5 % ccr code
Service not running, starting service...
╭───────────────────────────────────────────────────╮
Welcome to Claude Code! │
/help for help, /status for your current setup
cwd: /Users/zr/Code/GLM-4.5
───────────────────────────────────────────────
Overrides (via env):
API timeout: 600000ms
API Base URL: http://127.0.0.1:3456
╰───────────────────────────────────────────────────╯
Tip: Press Esc twice to edit your previous messages
> how can I run GLM-4.5 in transformers
I'll help you understand how to run GLM-4.5 in transformers. Let me first explore the codebase to see what's available.
List(.)
 Listed 21 paths (ctrl+r to expand)
Read(README.md)
 Read 4 lines (ctrl+r to expand)
Read(inference/trans_infer_cli.py)
 Read 29 lines (ctrl+r to expand)
Read(requirements.txt)
 Read 5 lines (ctrl+r to expand)
You can run GLM-4.5 in transformers using the example script in inference/trans_infer_cli.py. Here's the basic approach:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
MODEL_PATH = "zai-org/GLM-4.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=torch.bfloat16,
device_map="auto"
)
Then use the tokenizer's chat template for inference.
╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ > 
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
? for shortcuts
```
{
  "LOG": true,
  "Providers": [
    {
      "name": "glm-4.5-sglang",
      "api_base_url": "http://127.0.0.1:8000/v1/chat/completions",
      "api_key": "EMPTY",
      "models": [
        "glm-4.5"
      ]
    }
  ],
  "Router": {
    "default": "glm-4.5-sglang,glm-4.5",
    "background": "glm-4.5-sglang,glm-4.5",
    "think": "glm-4.5-sglang,glm-4.5",
    "longContext": "glm-4.5-sglang,glm-4.5",
    "webSearch": "glm-4.5-sglang,glm-4.5"
  }
}
import json

from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://127.0.0.1:8000/v1"
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current temperature for a given location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and country e.g. Bogotá, Colombia",
                    }
                },
                "required": ["location"],
                "additionalProperties": False,
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "return_delivered_order_items",
            "description": "Return items from a delivered order",
            "parameters": {
                "type": "object",
                "properties": {
                    "order_id": {
                        "type": "string",
                        "description": "The order ID, e.g. #W4794911",
                    },
                    "item_ids": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of item IDs to return",
                    },
                    "payment_method_id": {
                        "type": "string",
                        "description": "Payment method ID for processing the return, e.g. paypal_7503218",
                    },
                },
                "required": ["order_id", "item_ids", "payment_method_id"],
                "additionalProperties": False,
            },
        },
    },
]
tools_messages = [
    {
        "role": "tool",
        "tool_call_id": "tool-call-bf208d1d-9b5f-407f-8c6e-c35e54aa2fef",
        "content": '{"city": "北京", "date": "2024-06-27", "weather": "晴", "temperature": "26C"}',
    },
]
messages = [
    {"role": "system", "content": "请你调用工具,用中文回答问题。"},
    {"role": "user", "content": "请帮我查询一下北京的天气。"},
]
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
# First turn: the model is expected to emit a tool call for get_weather
completion = client.chat.completions.create(
    model="zai-org/GLM-4.5",
    messages=messages,
    tools=tools,
    max_tokens=4096,
    temperature=0.0,
    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}  # Uncomment this line to disable thinking
)
tool_call = completion.choices[0].message.tool_calls[0]
args = json.loads(tool_call.function.arguments)
print("===== TOOL CALL =====")
print(tool_call)
messages.append(completion.choices[0].message)
# Simulate the tool response by appending a pre-built tool message
messages.append(tools_messages[0])
# Second turn: the model answers using the simulated tool result
completion_2 = client.chat.completions.create(
    model="zai-org/GLM-4.5",
    messages=messages,
    tools=tools,
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print("===== RESPONSE =====")
print(completion_2.choices[0].message.content)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "zai-org/GLM-4.5"
messages = [{"role": "user", "content": "你好"}]

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
inputs = inputs.to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
# Decode only the newly generated tokens, skipping the prompt
output_text = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:])
print(output_text)
transformers>=4.54.0
pre-commit>=4.2.0
accelerate>=1.9.0
sglang>=0.4.10.post1
git+https://github.com/vllm-project/vllm.git
# or use `pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly` for streaming tool call support