# Byte-compiled / optimized / DLL files
.idea
__pycache__/
*.py[codz]
*$py.class
.DS_Store
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.7
    hooks:
      - id: ruff
        args: [--output-format, github, --fix, --select, I]
      - id: ruff-format
  - repo: https://github.com/crate-ci/typos
    rev: v1.32.0
    hooks:
      - id: typos
  - repo: https://github.com/jackdewinter/pymarkdown
    rev: v0.9.29
    hooks:
      - id: pymarkdown
        args: [fix]
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2025 Zhipu AI
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# GLM-4.5
## Paper
`GLM-4.5: Reasoning, Coding, and Agentic Abilities`
- https://z.ai/blog/glm-4.5
## Model Architecture
The GLM-4.5 series adopts an MoE (Mixture-of-Experts) architecture, which significantly improves compute efficiency for both training and inference. The MoE layers use loss-free balance routing and a sigmoid gating mechanism, as sketched below the figure.
<div align=center>
<img src="./asserts/model.png"/>
</div>
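To make the routing idea concrete, here is a minimal, illustrative PyTorch sketch of a sigmoid-gated top-k router with a loss-free balancing bias (the bias only influences expert selection, not the gate weights). This is a generic sketch of the technique, not the actual GLM-4.5 implementation; the class name, bias-update rule, and hyperparameters are assumptions.

```python
import torch
import torch.nn as nn


class SigmoidTopKRouter(nn.Module):
    """Illustrative sigmoid-gated top-k router with a loss-free balancing bias."""

    def __init__(self, hidden_size: int, num_experts: int, top_k: int, bias_lr: float = 1e-3):
        super().__init__()
        self.gate = nn.Linear(hidden_size, num_experts, bias=False)
        # Per-expert bias used only for expert *selection* (loss-free balancing).
        self.register_buffer("balance_bias", torch.zeros(num_experts))
        self.top_k = top_k
        self.bias_lr = bias_lr

    def forward(self, x: torch.Tensor):
        # x: [num_tokens, hidden_size]
        scores = torch.sigmoid(self.gate(x))                      # sigmoid gate, [tokens, experts]
        _, expert_idx = torch.topk(scores + self.balance_bias, self.top_k, dim=-1)
        weights = torch.gather(scores, -1, expert_idx)            # gate values of the chosen experts
        weights = weights / weights.sum(dim=-1, keepdim=True)     # normalize per token

        # Loss-free balancing: nudge the selection bias toward under-loaded experts
        # instead of adding an auxiliary load-balancing loss (in a real model this
        # update would only run during training).
        load = torch.zeros_like(self.balance_bias).scatter_add_(
            0, expert_idx.reshape(-1), torch.ones(expert_idx.numel(), device=x.device)
        )
        self.balance_bias += self.bias_lr * torch.sign(load.mean() - load)
        return expert_idx, weights


router = SigmoidTopKRouter(hidden_size=64, num_experts=8, top_k=2)
idx, w = router(torch.randn(16, 64))
print(idx.shape, w.shape)  # torch.Size([16, 2]) torch.Size([16, 2])
```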
## Algorithm Overview
Our base model goes through several training stages. During pre-training, the model is first trained on a 15T-token general corpus and then on a 7T-token code and reasoning corpus. After pre-training, a mid-training stage is introduced to further improve performance on targeted domains.
<div align=center>
<img src="./asserts/image.png"/>
</div>
## Environment Setup
### Hardware Requirements
DCU model: BW1000; number of nodes: 2; number of cards: 2 × 8.
### Communication Setup
Part 1: Basic inter-node communication
`Configure the following on the host machine:`
1. Disable the firewall:
```
systemctl stop firewalld # on CentOS
ufw disable              # on Ubuntu
```
2. Set `amd_iommu=on`:
```
vim /etc/default/grub
```
<div align=center>
<img src="./figures/amd_iommu.png"/>
</div>
Update the GRUB configuration:
```
grub2-mkconfig -o /boot/efi/EFI/rocky/grub.cfg
```
After rebooting, verify that the setting took effect (check that `iommu=pt` appears in the kernel command line, as below):
```
BOOT_IMAGE=(hd0,gpt3)/vmlinuz-4.18.0-372.9.1.el8.x86_64 root=UUID=80974f58-7d23-49bb-bd8b-8e299eb0d188 ro crashkernel=auto rhgb quiet systemd.unified_cgroup_hierachy=1 systemd.unified_cgroup_hierarchy=1 amd_iommu=on iommu=pt
```
`Configure the following inside the container started in a later step:`
```
apt update
apt install openssh-server -y
```
Edit `/etc/ssh/sshd_config` and set `PermitRootLogin` to `yes`:
```
# Uncomment the following 4 lines
RSAAuthentication yes                        # enable RSA authentication
PubkeyAuthentication yes                     # enable public/private key authentication
AuthorizedKeysFile ~/.ssh/authorized_keys    # public key file path (same file as generated below)
PermitRootLogin yes                          # allow root to log in via ssh
```
Restart the ssh service and enable it at boot:
```
service sshd restart
chkconfig sshd on
# Check sshd status:      service ssh status
# Start the sshd service: /etc/init.d/ssh restart
```
Next, set up the keys for passwordless communication between nodes:
1. Generate a key pair with ssh-keygen
```
ssh-keygen -t ed25519 # ed25519 is used here as an example; you may choose another key type/name. Press Enter at every prompt to accept the defaults.
```
2. Collect each node's public key (e.g. the `~/.ssh/id_ed25519.pub` generated above) and append it to `~/.ssh/authorized_keys` on every node, so that the `authorized_keys` file is eventually identical across all nodes. The format looks like this:
<div align=center>
<img src="./figures/id_rsa.png"/>
</div>
3. Set the communication port between nodes
```
/usr/sbin/sshd -p 10085 # Different nodes may use different port numbers. Once keys and ports are set up, verify inter-node connectivity with ssh -p; if it fails, re-check the previous steps.
```
These steps are not standardized; servers and clusters differ significantly, so the process cannot be copied verbatim. Adapt it to your own machines. The overall goal is to enable `amd_iommu` and passwordless SSH between the containers on different nodes.
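As a quick way to confirm that this goal has been reached, the sketch below checks passwordless SSH from the current container to every other node. The node IPs and the port are placeholders; substitute the values you chose above.

```python
import subprocess

# Placeholder node IPs and the sshd port chosen above; replace with your own values.
NODES = ["x.x.x.1", "x.x.x.2"]
PORT = "10085"

for node in NODES:
    # BatchMode=yes makes ssh fail instead of prompting for a password,
    # so a non-zero return code means passwordless login is not yet working.
    result = subprocess.run(
        ["ssh", "-p", PORT, "-o", "BatchMode=yes", "-o", "ConnectTimeout=5", node, "hostname"],
        capture_output=True, text=True,
    )
    status = "OK" if result.returncode == 0 else f"FAILED ({result.stderr.strip()})"
    print(f"{node}: {status}")
```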
Part 2: Ray-related communication
`Configure the following inside the container started in a later step:`
```
vim ~/.bashrc
```
Append the following commands to the end of `.bashrc` (a BW large-scale cluster is used as the example):
```
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_HOST_IP=x.x.x.x
export NCCL_SOCKET_IFNAME=ib0
export GLOO_SOCKET_IFNAME=ib0
unset NCCL_ALGO
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_NET_GDR_READ=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export LMSLIM_USE_LIGHTOP=0
# Recommended NUMA core binding for hycpu environments:
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# Additional environment variables required on BW clusters:
export NCCL_NET_GDR_LEVEL=7
export NCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="topo-input.xml"
# For K100_AI cards, add the following (commented out here because BW cards are used as the example):
# export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44
```
`VLLM_HOST_IP` and `NCCL_SOCKET_IFNAME` must be replaced with the values found on each of your own machines; every node has a different IP. They can be queried as follows:
```
Query the interface and IP with: ifconfig
VLLM_HOST_IP: the node's local communication IP
NCCL_SOCKET_IFNAME and GLOO_SOCKET_IFNAME: the node's local network interface name
```
`Example:`
<div align=center>
<img src="./figures/ip.png"/>
</div>
On clusters with BW cards, VLLM_HOST_IP must be set to the IP of the IB NIC to avoid RCCL timeouts:
<div align=center>
<img src="./figures/ip_bw.png"/>
</div>
Note: after adding the variables above, activate them:
```
source ~/.bashrc
```
`Tip: communication configuration is normally the domain of operations staff; we recommend asking your ops team to apply the settings above.`
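Before starting any service, a quick sanity check that the variables above are actually active in the current container can save debugging time. A minimal sketch (the variable list is taken from the `.bashrc` snippet above; trim or extend it for your card type):

```python
import os

# Variables from the .bashrc snippet above; adjust for your own cluster.
required = [
    "VLLM_HOST_IP",
    "NCCL_SOCKET_IFNAME",
    "GLOO_SOCKET_IFNAME",
    "HIP_VISIBLE_DEVICES",
    "NCCL_IB_HCA",
    "NCCL_TOPO_FILE",
]

missing = [name for name in required if not os.environ.get(name)]
if missing:
    print("Missing environment variables:", ", ".join(missing))
else:
    print("All required variables are set, VLLM_HOST_IP =", os.environ["VLLM_HOST_IP"])
```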
### Docker (Option 1)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:vllm-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250802-step3
docker run -it --name glm4.5_vllm --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v $PWD/Step3_pytorch:/home/Step3_pytorch f0e4191089de /bin/bash
wget --content-disposition 'https://download.sourcefind.cn:65024/file/4/triton/DAS1.6/triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl'
pip install triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
### Dockerfile (Option 2)
```
cd $PWD/Step3_pytorch/docker
docker build --no-cache -t glm4.5:latest .
docker run -it --name glm4.5_vllm --shm-size=1024G --device=/dev/kfd --device=/dev/dri/ --privileged --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v /opt/hyhal:/opt/hyhal:ro -v $PWD/Step3_pytorch:/home/Step3_pytorch f0e4191089de /bin/bash
wget --content-disposition 'https://download.sourcefind.cn:65024/file/4/triton/DAS1.6/triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl'
pip install triton-3.0.0+das.opt1.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
## Dataset
`None`
## Training
`None`
## Inference
Pretrained weight directory layout; replace the model paths in later commands with your actual weight path:
```
/home/glm4.5_pytorch/
└── zai-org/GLM-4.5
```
### Multi-node, multi-card
Start the Ray cluster:
```
# Start Ray
# Start Ray on the head node. x.x.x.x is the head node IP found with ifconfig earlier (VLLM_HOST_IP); --port can be any value, as long as it is the same on the head and worker nodes.
ray start --head --node-ip-address=x.x.x.x --port=6379 --num-gpus=8 --num-cpus=32
# Start Ray on the other nodes. Note: x.x.x.x is still the head node IP found with ifconfig earlier (VLLM_HOST_IP).
ray start --address='x.x.x.x:6379' --num-gpus=8 --num-cpus=32
# Use `ray status` to check the cluster state.
```
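Besides `ray status`, you can also confirm from Python that both nodes joined and all 16 GPUs are registered. A small sketch (run it on the head node; it only reads cluster state):

```python
import ray

# Attach to the running cluster started by `ray start` above.
ray.init(address="auto")

alive = [n for n in ray.nodes() if n["Alive"]]
total_gpus = sum(n["Resources"].get("GPU", 0) for n in alive)
print(f"alive nodes: {len(alive)}, total GPUs: {total_gpus}")  # expect 2 nodes / 16 GPUs here

ray.shutdown()
```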
vLLM deployment (upstream vLLM does not yet support AFD; only non-disaggregated deployment is supported):
```
# Run on the head node
VLLM_USE_NN=0 VLLM_USE_FLASH_ATTN_PA=0 vllm serve /path/to/GLM4.5 \
--reasoning-parser glm4_moe \
--enable-auto-tool-choice \
--tool-call-parser glm4_moe \
--trust-remote-code \
--max-num-batched-tokens 4096 \
--distributed-executor-backend ray \
--dtype float16 \
-tp 16 \
--port $PORT_SERVING
```
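Before sending real requests, you can confirm that the service on the head node is reachable. A small sketch using the same `openai` client as `inference/api_request.py` (the host and port are placeholders for your own deployment):

```python
from openai import OpenAI

# Placeholder address: the head node IP and the $PORT_SERVING value used above.
client = OpenAI(api_key="EMPTY", base_url="http://x.x.x.x:8000/v1")

# The OpenAI-compatible server exposes its served model list; if this call
# succeeds, the 16-card deployment is up and ready for chat requests.
for model in client.models.list().data:
    print(model.id)
```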
- Client request
```bash
python inference/api_request.py
```
For more information, see [`README_zh`](./README_zh.md) from the upstream project.
## Results
Example 1:
- Input text: 请帮我查询一下北京的天气。 ("Please check the weather in Beijing for me.")
<div align=center>
<img src="./asserts/example.png"/>
</div>
- Output:
<div align=center>
<img src="./asserts/results.png"/>
</div>
### Accuracy
Accuracy on DCU matches that on GPU; inference framework: vLLM.
## Application Scenarios
### Algorithm Category
`Dialogue / question answering`
### Key Application Industries
`E-commerce, education, broadcast media`
## Pretrained Weights
Hugging Face weight download:
- [zai-org/GLM-4.5](https://huggingface.co/zai-org/GLM-4.5)
`Note: a mirror is recommended for downloading: export HF_ENDPOINT=https://hf-mirror.com`
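For scripted downloads, the same mirror can be used with `huggingface_hub`. A sketch (the local directory is a placeholder chosen to match the weight layout shown earlier):

```python
import os

# Use the mirror endpoint recommended above; it must be set before importing huggingface_hub.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="zai-org/GLM-4.5",
    local_dir="/home/glm4.5_pytorch/zai-org/GLM-4.5",  # placeholder target directory
)
```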
## Source Repository and Issue Feedback
- https://developer.sourcefind.cn/codes/modelzoo/glm4.5_pytorch
## References
- https://github.com/zai-org/GLM-4.5/tree/main
# GLM-4.5
[English Version](./README.md)
<div align="center">
<img src=resources/logo.svg width="15%"/>
</div>
<p align="center">
👋 Join our <a href="resources/WECHAT.md" target="_blank"> WeChat group </a> and <a href="https://discord.gg/QR7SARHRxK" target="_blank"> Discord </a> community.
<br>
📖 Check out the GLM-4.5 <a href="https://z.ai/blog/glm-4.5" target="_blank"> technical blog </a> and the <a href="https://zhipu-ai.feishu.cn/wiki/Gv3swM0Yci7w7Zke9E0crhU7n7D" target="_blank"> Zhipu AI technical documentation </a>.
<br>
📍 Use the GLM-4.5 API service on the <a href="https://docs.bigmodel.cn/cn/guide/models/text/glm-4.5"> Zhipu AI Open Platform </a>.
<br>
👉 Try <a href="https://chat.z.ai" >GLM-4.5 </a> with one click.
</p>
## Model Introduction
The **GLM-4.5** series are foundation models designed for agent applications. GLM-4.5 has **355** billion total parameters with **32** billion active parameters, while GLM-4.5-Air adopts a more compact design with **106** billion total parameters and **12** billion active parameters. GLM-4.5 models unify reasoning, coding, and agentic capabilities to meet the complex demands of agent applications.
Both GLM-4.5 and GLM-4.5-Air are hybrid reasoning models offering two modes: a thinking mode for complex reasoning and tool use, and a non-thinking mode for immediate responses.
We have open-sourced the base models, the hybrid reasoning models, and FP8 versions of the hybrid reasoning models for both GLM-4.5 and GLM-4.5-Air. They are released under the MIT license and can be used commercially and for secondary development.
In our comprehensive evaluation across 12 industry-standard benchmarks, GLM-4.5 performs strongly with a score of **63.2**, ranking **3rd** among all proprietary and open-source models. Notably, GLM-4.5-Air achieves a competitive **59.8** while maintaining excellent efficiency.
![bench](resources/bench.png)
For more evaluation results, demo cases, and technical details, please visit our [technical blog](https://z.ai/blog/glm-4.5). The technical report will be released soon.
The model code, tool parser, and reasoning parser implementations can be found in [transformers](https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4_moe), [vLLM](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/glm4_moe_mtp.py), and
[SGLang](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/glm4_moe.py).
## Model Downloads
You can try the model directly on [Hugging Face](https://huggingface.co/spaces/zai-org/GLM-4.5-Space) or
[ModelScope](https://modelscope.cn/studios/ZhipuAI/GLM-4.5-Demo), or download it from the links below.
| Model | Download Link | Model Size | Precision |
|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|-----------|------|
| GLM-4.5 | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5) | 355B-A32B | BF16 |
| GLM-4.5-Air | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Air)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air) | 106B-A12B | BF16 |
| GLM-4.5-FP8 | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-FP8)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-FP8) | 355B-A32B | FP8 |
| GLM-4.5-Air-FP8 | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Air-FP8)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air-FP8) | 106B-A12B | FP8 |
| GLM-4.5-Base | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Base)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Base) | 355B-A32B | BF16 |
| GLM-4.5-Air-Base | [🤗 Hugging Face](https://huggingface.co/zai-org/GLM-4.5-Air-Base)<br> [🤖 ModelScope](https://modelscope.cn/models/ZhipuAI/GLM-4.5-Air-Base) | 106B-A12B | BF16 |
## System Requirements
### Inference
We provide minimum and recommended configurations for "full-featured" model inference. The data in the tables below are based on the following conditions:
1. All models use the MTP layer with `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4` specified to ensure competitive inference speed.
2. The `cpu-offload` parameter is not used.
3. The inference batch size does not exceed `8`.
4. All operations are performed on devices with native FP8 inference support, so both weights and cache are in FP8 format.
5. Server memory must exceed `1T` to ensure normal model loading and operation.
The models can run under the configurations in the following table:
| Model | Precision | GPU Type and Count | Test Framework |
|-------------|------|----------------------|--------|
| GLM-4.5 | BF16 | H100 x 16 / H200 x 8 | sglang |
| GLM-4.5 | FP8 | H100 x 8 / H200 x 4 | sglang |
| GLM-4.5-Air | BF16 | H100 x 4 / H200 x 2 | sglang |
| GLM-4.5-Air | FP8 | H100 x 2 / H200 x 1 | sglang |
Under the configurations in the following table, the models can make full use of their 128K context length:
| Model | Precision | GPU Type and Count | Test Framework |
|-------------|------|-----------------------|--------|
| GLM-4.5 | BF16 | H100 x 32 / H200 x 16 | sglang |
| GLM-4.5 | FP8 | H100 x 16 / H200 x 8 | sglang |
| GLM-4.5-Air | BF16 | H100 x 8 / H200 x 4 | sglang |
| GLM-4.5-Air | FP8 | H100 x 4 / H200 x 2 | sglang |
### Fine-tuning
With the [Llama Factory](https://github.com/hiyouga/LLaMA-Factory) framework, the code can run under the following configurations:
| Model | GPU Type and Count | Strategy | Batch Size (per GPU) |
|-------------|-----------|------|-------------|
| GLM-4.5 | H100 x 16 | Lora | 1 |
| GLM-4.5-Air | H100 x 4 | Lora | 1 |
With the [Swift](https://github.com/modelscope/ms-swift) framework, the code can run under the following configurations:
| Model | GPU Type and Count | Strategy | Batch Size (per GPU) |
|-------------|-------------------|------|-------------|
| GLM-4.5 | H20 (96GiB) x 16 | Lora | 1 |
| GLM-4.5-Air | H20 (96GiB) x 4 | Lora | 1 |
| GLM-4.5 | H20 (96GiB) x 128 | SFT | 1 |
| GLM-4.5-Air | H20 (96GiB) x 32 | SFT | 1 |
| GLM-4.5 | H20 (96GiB) x 128 | RL | 1 |
| GLM-4.5-Air | H20 (96GiB) x 32 | RL | 1 |
## Quick Start
Install the required packages listed in `requirements.txt`:
```shell
pip install -r requirements.txt
```
### transformers
See `trans_infer_cli.py` in the `inference` folder; its core flow, trimmed down, is shown below.
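```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "zai-org/GLM-4.5"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build the prompt with the model's chat template, then generate.
messages = [{"role": "user", "content": "你好"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_dict=True, return_tensors="pt"
).to(model.device)
generated = model.generate(**inputs, max_new_tokens=128, do_sample=False)
# Decode only the newly generated tokens, skipping the prompt.
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[1]:]))
```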
### vLLM
+ Both BF16 and FP8 can be launched with the following command:
```shell
vllm serve zai-org/GLM-4.5-Air \
--tensor-parallel-size 8 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--enable-auto-tool-choice \
--served-model-name glm-4.5-air
```
If you are using 8x H100 GPUs and run out of memory when serving the GLM-4.5 model, you will need `--cpu-offload-gb 16` (vLLM only).
If you run into `flash infer` issues, use `VLLM_ATTENTION_BACKEND=XFORMERS` as a temporary workaround. You can also specify
`TORCH_CUDA_ARCH_LIST='9.0+PTX'` to use `flash infer` (different GPUs require different TORCH_CUDA_ARCH_LIST values; check accordingly).
### SGLang
+ BF16
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5-Air \
--tp-size 8 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--served-model-name glm-4.5-air \
--host 0.0.0.0 \
--port 8000
```
+ FP8
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5-Air-FP8 \
--tp-size 4 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--disable-shared-experts-fusion \
--served-model-name glm-4.5-air-fp8 \
--host 0.0.0.0 \
--port 8000
```
### Request Parameter Notes
+ With `vLLM` and `SGLang`, thinking mode is enabled by default when sending requests. To disable it, add the
`extra_body={"chat_template_kwargs": {"enable_thinking": False}}` parameter.
+ Both support tool calling. Use the OpenAI-style tool description format, as shown in the sketch below.
+ For concrete code, see `api_request.py` in the `inference` folder.
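A minimal sketch covering both points, modeled on `inference/api_request.py` (the endpoint is a placeholder and the tool schema is just an example):

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")  # placeholder endpoint

# OpenAI-style tool description
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current temperature for a given location.",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

completion = client.chat.completions.create(
    model="glm-4.5-air",  # the --served-model-name used when launching the server
    messages=[{"role": "user", "content": "What is the weather in Beijing?"}],
    tools=tools,
    # Thinking is on by default; this extra_body switches it off.
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print(completion.choices[0].message.tool_calls)
```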
# Setting up Claude Code Service with SGLang + GLM-4.5 Model
[中文阅读](./README_zh.md)
## Installation
You need a local computer for programming and a server to run the `GLM-4.5` model.
### Local Device
Ensure you have installed [Claude Code](https://github.com/anthropics/claude-code)
and [Claude Code Router](https://github.com/musistudio/claude-code-router).
```
npm install -g @anthropic-ai/claude-code
npm install -g @musistudio/claude-code-router
```
### Server
Ensure you have installed `sglang` on your server.
```shell
pip install sglang
```
And start the model service with the following command:
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5 \
--tp-size 16 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--served-model-name glm-4.5 \
--port 8000 \
--host 0.0.0.0 # Or your server's internal/public IP address
```
When successful, you will see output similar to the following:
```
[2025-07-26 16:09:07] INFO: Started server process [80269]
[2025-07-26 16:09:07] INFO: Waiting for application startup.
[2025-07-26 16:09:07] INFO: Application startup complete.
[2025-07-26 16:09:07] INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
[2025-07-26 16:09:08] INFO: 127.0.0.1:57722 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-07-26 16:09:11] INFO: 127.0.0.1:57732 - "POST /generate HTTP/1.1" 200 OK
[2025-07-26 16:09:11] The server is fired up and ready to roll!
```
Please ensure your server's IP can be accessed from the device where Claude Code and Claude Code Router are installed.
## Configuration
1. Modify the `api_base_url` in `config.example.json` to your server's accessible address.
2. Rename it to `config.json`
3. Copy it to `~/.claude-code-router/config.json`.
4. Execute the `ccr restart` command in the command line. You should see the following output:
```
Service was not running or failed to stop.
Starting claude code router service...
✅ Service started successfully in the background.
```
## Running
Execute the following command:
```
ccr code
```
It should look like this:
```
zr@MacBook GLM-4.5 % ccr code
Service not running, starting service...
╭───────────────────────────────────────────────────╮
Welcome to Claude Code! │
/help for help, /status for your current setup
cwd: /Users/zr/Code/GLM-4.5
───────────────────────────────────────────────
Overrides (via env):
API timeout: 600000ms
API Base URL: http://127.0.0.1:3456
╰───────────────────────────────────────────────────╯
Tip: Press Esc twice to edit your previous messages
> how can I run GLM-4.5 in transformers
I'll help you understand how to run GLM-4.5 in transformers. Let me first explore the codebase to see what's available.
List(.)
 Listed 21 paths (ctrl+r to expand)
Read(README.md)
 Read 4 lines (ctrl+r to expand)
Read(inference/trans_infer_cli.py)
 Read 29 lines (ctrl+r to expand)
Read(requirements.txt)
 Read 5 lines (ctrl+r to expand)
You can run GLM-4.5 in transformers using the example script in inference/trans_infer_cli.py. Here's the basic approach:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
MODEL_PATH = "zai-org/GLM-4.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=torch.bfloat16,
device_map="auto"
)
Then use the tokenizer's chat template for inference.
╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ > 
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
? for shortcuts
```
# Setting up a Claude Code Service with SGLang + GLM-4.5
[Read this in English.](./README.md)
## Installation
You need a local computer for programming and a server to run the `GLM-4.5` model.
### Local Device
Make sure you have installed [Claude Code](https://github.com/anthropics/claude-code)
and [Claude Code Router](https://github.com/musistudio/claude-code-router).
```
npm install -g @anthropic-ai/claude-code
npm install -g @musistudio/claude-code-router
```
### Server
Make sure `sglang` is installed on your server:
```shell
pip install sglang
```
Then start the model service with the following command:
```shell
python3 -m sglang.launch_server \
--model-path zai-org/GLM-4.5 \
--tp-size 16 \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--speculative-algorithm EAGLE \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.7 \
--served-model-name glm-4.5 \
--port 8000 \
--host 0.0.0.0 # or your server's internal/public IP address
```
When it starts successfully, you will see output similar to the following:
```
[2025-07-26 16:09:07] INFO: Started server process [80269]
[2025-07-26 16:09:07] INFO: Waiting for application startup.
[2025-07-26 16:09:07] INFO: Application startup complete.
[2025-07-26 16:09:07] INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
[2025-07-26 16:09:08] INFO: 127.0.0.1:57722 - "GET /get_model_info HTTP/1.1" 200 OK
[2025-07-26 16:09:08 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0,
[2025-07-26 16:09:11] INFO: 127.0.0.1:57732 - "POST /generate HTTP/1.1" 200 OK
[2025-07-26 16:09:11] The server is fired up and ready to roll!
```
Make sure your server's IP is reachable from the device where Claude Code and Claude Code Router are installed.
## Configuration
1. Change `api_base_url` in `config.example.json` to your server's accessible address.
2. Rename it to `config.json`.
3. Copy it to `~/.claude-code-router/config.json`.
4. Run `ccr restart` on the command line. You should see the following output:
```
Service was not running or failed to stop.
Starting claude code router service...
✅ Service started successfully in the background.
```
## Running
Run the following command:
```
ccr code
```
It should then run normally, as shown below:
```
zr@MacBook GLM-4.5 % ccr code
Service not running, starting service...
╭───────────────────────────────────────────────────╮
Welcome to Claude Code! │
/help for help, /status for your current setup
cwd: /Users/zr/Code/GLM-4.5
───────────────────────────────────────────────
Overrides (via env):
API timeout: 600000ms
API Base URL: http://127.0.0.1:3456
╰───────────────────────────────────────────────────╯
Tip: Press Esc twice to edit your previous messages
> how can I run GLM-4.5 in transformers
I'll help you understand how to run GLM-4.5 in transformers. Let me first explore the codebase to see what's available.
List(.)
 Listed 21 paths (ctrl+r to expand)
Read(README.md)
 Read 4 lines (ctrl+r to expand)
Read(inference/trans_infer_cli.py)
 Read 29 lines (ctrl+r to expand)
Read(requirements.txt)
 Read 5 lines (ctrl+r to expand)
You can run GLM-4.5 in transformers using the example script in inference/trans_infer_cli.py. Here's the basic approach:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
MODEL_PATH = "zai-org/GLM-4.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
MODEL_PATH,
torch_dtype=torch.bfloat16,
device_map="auto"
)
Then use the tokenizer's chat template for inference.
╭───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ > 
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
? for shortcuts
```
{
  "LOG": true,
  "Providers": [
    {
      "name": "glm-4.5-sglang",
      "api_base_url": "http://127.0.0.1:8000/v1/chat/completions",
      "api_key": "EMPTY",
      "models": [
        "glm-4.5"
      ]
    }
  ],
  "Router": {
    "default": "glm-4.5-sglang,glm-4.5",
    "background": "glm-4.5-sglang,glm-4.5",
    "think": "glm-4.5-sglang,glm-4.5",
    "longContext": "glm-4.5-sglang,glm-4.5",
    "webSearch": "glm-4.5-sglang,glm-4.5"
  }
}
import json

from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://127.0.0.1:8000/v1"
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current temperature for a given location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and country e.g. Bogotá, Colombia",
                    }
                },
                "required": ["location"],
                "additionalProperties": False,
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "return_delivered_order_items",
            "description": "Return items from a delivered order",
            "parameters": {
                "type": "object",
                "properties": {
                    "order_id": {
                        "type": "string",
                        "description": "The order ID, e.g. #W4794911",
                    },
                    "item_ids": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of item IDs to return",
                    },
                    "payment_method_id": {
                        "type": "string",
                        "description": "Payment method ID for processing the return, e.g. paypal_7503218",
                    },
                },
                "required": ["order_id", "item_ids", "payment_method_id"],
                "additionalProperties": False,
            },
        },
    },
]
tools_messages = [
    {
        "role": "tool",
        "tool_call_id": "tool-call-bf208d1d-9b5f-407f-8c6e-c35e54aa2fef",
        "content": '{"city": "北京", "date": "2024-06-27", "weather": "晴", "temperature": "26C"}',
    },
]
messages = [
    {"role": "system", "content": "请你调用工具,用中文回答问题。"},
    {"role": "user", "content": "请帮我查询一下北京的天气。"},
]
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
# First turn: the model is expected to emit a tool call for get_weather
completion = client.chat.completions.create(
    model="zai-org/GLM-4.5",
    messages=messages,
    tools=tools,
    max_tokens=4096,
    temperature=0.0,
    # extra_body={"chat_template_kwargs": {"enable_thinking": False}}  # Uncomment this line to disable thinking
)
tool_call = completion.choices[0].message.tool_calls[0]
args = json.loads(tool_call.function.arguments)
print("===== TOOL CALL =====")
print(tool_call)
messages.append(completion.choices[0].message)
# Simulate the tool response by appending a pre-built tool message
messages.append(tools_messages[0])
# Second turn: the model answers using the simulated tool result
completion_2 = client.chat.completions.create(
    model="zai-org/GLM-4.5",
    messages=messages,
    tools=tools,
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print("===== RESPONSE =====")
print(completion_2.choices[0].message.content)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "zai-org/GLM-4.5"
messages = [{"role": "user", "content": "你好"}]

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
inputs = inputs.to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
# Decode only the newly generated tokens, skipping the prompt
output_text = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:])
print(output_text)
transformers>=4.54.0
pre-commit>=4.2.0
accelerate>=1.9.0
sglang>=0.4.10.post1
git+https://github.com/vllm-project/vllm.git
# or use `pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly` for streaming tool call support