Add sglang

85ca2fc1 · chenych · fd1ada03 · 85ca2fc1 · 85ca2fc1
Commit 85ca2fc1 authored Apr 21, 2026 by chenych
Hide whitespace changes
Inline Side-by-side

Showing with 57 additions and 5 deletions

README.md README.md +56 -4

model.properties model.properties +1 -1

No files found.
--- a/README.md
+++ b/README.md
@@ -11,12 +11,16 @@
 |     DTK      | 26.04 |
 |    python    | 3.10.12 |
 | transformers |  5.2.0  |
-|    torch     |  2.9.0+das.opt1.dtk2604.20260331.g4e3c1e7  |
-|     vllm     |  0.15.1+das.opt1.alpha.dtk2604.torch290.2604042155.gba9f96  |
+|    torch     |  2.9.0  |
+|     vllm     |  0.15.1  |
+|    sglang    |  0.5.10rc0  |

-当前仅支持镜像: harbor.sourcefind.cn:5443/dcu/admin/base/custom:vllm015-ubuntu22.04-dtk26.04-glm5-0408
+当前仅支持镜像:
+- **vLLM推理请使用:** harbor.sourcefind.cn:5443/dcu/admin/base/custom:vllm015-ubuntu22.04-dtk26.04-glm5-0408
+- **SGLang推理请使用:** harbor.sourcefind.cn:5443/dcu/admin/base/custom:sglang-0.5.10-glm5-0416

 - 挂载地址`-v`根据实际模型情况修改
+- 下面以`vLLM`镜像为例启动容器；如果使用`SGLang`，请将镜像地址替换为上述`SGLang`镜像

 ```bash
 docker run -it \
@@ -47,6 +51,55 @@ docker run -it \
 ## 推理
 > 如果出现`ImportError: librocm_smi64.so.2: cannot open shared object file: No such file or directory`报错，系机器hyhal版本较低所致，请进行升级。

+### SGLang
+1. 加入环境变量
+```bash
+export SGLANG_USE_LIGHTOP=1
+export HIP_GRAPH_USE_CMD_CACHE=0
+export SGLANG_ROCM_USE_AITER_MOE=0
+```
+
+2. 启动服务
+```bash
+model_path=ZhipuAI/GLM-5-FP8
+
+option="--numa-node 0 0 0 0 1 1 1 1 "
+option+=" --disable-radix-cache "
+option+=" --chunked-prefill-size 16384"
+option+=" --page-size 64 "
+option+=" --nsa-prefill-backend flashmla_auto --nsa-decode-backend flashmla_kv "
+# option+=" --quantization slimquant_marlin "
+
+python3 -m sglang.launch_server --model-path "${model_path}" ${option} \
+                                --trust-remote-code \
+                                --reasoning-parser glm45 \
+                                --tool-call-parser glm47 \
+                                --kv-cache-dtype fp8_e4m3 \
+                                --dtype bfloat16 \
+                                --mem-fraction-static 0.925 \
+                                --host 0.0.0.0 \
+                                --port 8001 \
+                                --tp-size 8 \
+                                --context-length 32768 \
+                                --served-model-name glm-5-fp8
+```
+
+3. 启动完成后可通过以下方式访问：
+```bash
+curl http://localhost:8001/v1/chat/completions   \
+    -H "Content-Type: application/json"  \
+    -d '{
+        "model": "glm-5-fp8",
+        "messages": [
+          {"role": "system", "content": "You are a helpful assistant."},
+          {"role": "user", "content": "What is 15% of 240?"}
+        ],
+        "max_tokens": 2048,
+        "temperature": 0.7,
+        "chat_template_kwargs": {"enable_thinking": false}
+    }'
+```
+
 ### vllm
 #### 单机推理
 1. 加入环境变量
@@ -75,7 +128,6 @@ export VLLM_USE_OPT_CAT=1
 export VLLM_USE_FUSED_FILL_RMS_CAT=1
 export VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD=0
 export VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT=0
-export VLLM_USE_V32_ENCODE=1
 export VLLM_USE_FLASH_MLA=1
 export VLLM_DISABLE_DSA=0
 export USE_LIGHTOP_TOPK=1

--- a/model.properties
+++ b/model.properties
@@ -9,6 +9,6 @@ processType=推理
 # 算法类别
 appCategory=对话问答
 # 框架类型
-frameType=vllm
+frameType=vllm,sglang
 # 加速卡类型
 accelerateType=BW1000,BW1100
\ No newline at end of file