ModelZoo / DeepSeek-V3.2-Exp_vllm / Commits / 024e6d37

Commit 024e6d37, authored Oct 03, 2025 by chenych

Fix bugs and complete vllm serve

parent 590059ff

Showing 5 changed files with 95 additions and 50 deletions (+95 −50)
- README.md (+38 −49)
- doc/results_dcu.jpg (+0 −0)
- doc/results_dcu.png (+0 −0)
- docker/Dockerfile (+1 −1)
- inference/config.json (+56 −0)
README.md

...
@@ -21,16 +21,10 @@ DCU model: K100AI; nodes: 4; cards: 32.
### Docker (Method 1)

```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged \
    --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video \
    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root \
    -v /path/your_code_data/:/path/your_code_data/ \
    -v /opt/hyhal/:/opt/hyhal/:ro \
    {imageID} bash
cd /your_code_path/deepseek-v3.2-exp_pytorch
```
### Dockerfile (Method 2)

...
@@ -40,13 +34,7 @@ docker build --no-cache -t deepseek-v3.2-exp:latest .
```bash
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged \
    --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video \
    --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root \
    -v /path/your_code_data/:/path/your_code_data/ \
    -v /opt/hyhal/:/opt/hyhal/:ro \
    {imageID} bash
cd /your_code_path/deepseek-v3.2-exp_pytorch
```
### Anaconda (Method 3)

...
@@ -55,16 +43,12 @@ cd /your_code_path/deepseek-v3.2-exp_pytorch
```
DTK: 25.04.1
python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041
```

`Tips: the DTK driver, PyTorch, and the other DCU toolchain versions above must match each other exactly.` The remaining packages are installed as follows:

```bash
# download and install the DCU build of vllm
# (the wget step is an assumption: the diff shows only the wheel URL)
wget http://10.16.4.1:8000/customized/vllm/dtk25.04.1/v0.9.2-dsv32/v0.9.2-dsv32-cf360956/vllm-0.9.2%2Bdas.opt1.rc2.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
pip install vllm-0.9.2+das.opt1.rc2.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
cd /your_code_path/deepseek-v3.2-exp_pytorch
```
## Dataset

...
@@ -76,16 +60,14 @@ cd /your_code_path/deepseek-v3.2-exp_pytorch
## Inference

Sample model: [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)

First convert the model to bf16 format; the conversion commands are:

```bash
# convert fp8 to bf16
python inference/fp8_cast_bf16.py --input-fp8-hf-path /path/to/DeepSeek-V3.2-Exp --output-bf16-hf-path /path/to/DeepSeek-V3.2-Exp-bf16
# copy the config file
cp inference/config.json /path/to/DeepSeek-V3.2-Exp-bf16
```

After the conversion, copy `generation_config.json`, `tokenizer_config.json`, and `tokenizer.json` from the original model into `/path/to/DeepSeek-V3.2-Exp-bf16`.
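A forgotten copy of one of these four JSON files only surfaces after the lengthy multi-node startup fails. A small pre-flight check can catch it earlier; this is a hypothetical helper (`check_bf16_dir` is not part of the repo), sketched for illustration:

```shell
# Hypothetical pre-flight check: verify the converted directory contains
# config.json (copied from inference/) plus the three files taken from
# the original model, before launching the server.
check_bf16_dir() {
    dir="$1"
    rc=0
    for f in config.json generation_config.json tokenizer_config.json tokenizer.json; do
        if [ ! -f "$dir/$f" ]; then
            echo "missing: $f"
            rc=1
        fi
    done
    return $rc
}
```

Usage: `check_bf16_dir /path/to/DeepSeek-V3.2-Exp-bf16 || echo "copy the missing files first"`.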
### vllm inference

#### server (multi-node)

...
@@ -108,16 +90,26 @@ python inference/fp8_cast_bf16.py --input-fp8-hf-path /path/to/DeepSeek-V3.2-Exp
```bash
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_HOST_IP=x.x.x.x        # IP of this compute node; prefer the address on the IB SOCKET_IFNAME port
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=ibxxxx
export GLOO_SOCKET_IFNAME=ibxxxx
export NCCL_IB_HCA=mlx5_0:1        # name of the IB NIC in this environment
unset NCCL_ALGO
export NCCL_IB_DISABLE=0
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export NCCL_NET_GDR_READ=1
export NCCL_DEBUG=INFO
export NCCL_MIN_P2P_NCHANNELS=16
export NCCL_NCHANNELS_PER_PEER=16
export HIP_USE_GRAPH_QUEUE_POOL=1
export VLLM_ENABLE_MOE_FUSED_GATE=0
export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44
export VLLM_RPC_TIMEOUT=1800000
export VLLM_USE_FLASH_MLA=1
# Hygon CPU core binding; can be omitted on Intel CPUs
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
```

...
@@ -127,10 +119,10 @@ export VLLM_RANK4_NUMA=4

```bash
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# extra environment variables required on BW clusters
export NCCL_NET_GDR_LEVEL=7
export NCCL_SDMA_COPY_ENABLE=0
```
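These exports must be set identically in the shell of every node before `ray start` and `vllm serve` run there. One way to avoid drift is to write them to a file once and source it on each node; a minimal sketch showing only a few of the variables, assuming `/tmp/dsv32_env.sh` as the (arbitrary) location:

```shell
# Write the shared environment once, then source it on every node
# (e.g. over ssh). Only a subset of the exports above is shown.
ENV_FILE=/tmp/dsv32_env.sh
cat > "$ENV_FILE" <<'EOF'
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export VLLM_RPC_TIMEOUT=1800000
EOF
. "$ENV_FILE"
```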
2. Start the RAY cluster

...
@@ -147,43 +139,40 @@ ray start --address='x.x.x.x:6379' --num-gpus=8 --num-cpus=32

> Intel CPUs additionally need the flag: `--enforce-eager`
```bash
vllm serve deepseek-ai/DeepSeek-V3.2-Exp \
    --trust-remote-code \
    --distributed-executor-backend ray \
    --dtype bfloat16 \
    --tensor-parallel-size 32 \
    --max-model-len 32768 \
    --no-enable-chunked-prefill \
    --no-enable-prefix-caching \
    --host 12.12.12.11 \
    --port 8001 \
    --kv-cache-dtype bfloat16
```
Once startup completes, the server can be queried as follows:

```bash
curl http://127.0.0.1:8001/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "deepseek-ai/DeepSeek-V3.2-Exp",
        "messages": [
            {
                "role": "user",
                "content": "请介绍下你自己。"
            }
        ],
        "temperature": 0.7,
        "max_tokens": 2048,
        "chat_template_kwargs": {
            "thinking": false
        }
    }'
```
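The response body is an OpenAI-style chat-completion JSON; for quick interactive checks it helps to print only the assistant text. A hypothetical filter (`extract_content` is not part of the repo; `jq` would do the same job):

```shell
# Read an OpenAI-style chat completion from stdin and print only the
# assistant message text.
extract_content() {
    python3 -c 'import json, sys; print(json.load(sys.stdin)["choices"][0]["message"]["content"])'
}
```

Usage: pipe the curl above through it, e.g. `curl -s ... | extract_content`.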
## result

<div align=center>
<img src="./doc/results_dcu.png" />
</div>

### Accuracy

...
doc/results_dcu.jpg — deleted (100644 → 0), 1020 KB

doc/results_dcu.png — added (0 → 100644), 71.7 KB
docker/Dockerfile

```dockerfile
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
```

(No newline at end of file)
inference/config.json (new file)

```json
{
  "architectures": ["DeepseekV3ForCausalLM"],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "ep_size": 1,
  "first_k_dense_replace": 3,
  "hidden_act": "silu",
  "hidden_size": 7168,
  "index_head_dim": 128,
  "index_n_heads": 64,
  "index_topk": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 18432,
  "kv_lora_rank": 512,
  "max_position_embeddings": 163840,
  "model_type": "deepseek_v3",
  "moe_intermediate_size": 2048,
  "moe_layer_freq": 1,
  "n_group": 8,
  "n_routed_experts": 256,
  "n_shared_experts": 1,
  "norm_topk_prob": true,
  "num_attention_heads": 128,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 61,
  "num_key_value_heads": 128,
  "num_nextn_predict_layers": 1,
  "q_lora_rank": 1536,
  "qk_nope_head_dim": 128,
  "qk_rope_head_dim": 64,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "beta_fast": 32,
    "beta_slow": 1,
    "factor": 40,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
  },
  "rope_theta": 10000,
  "routed_scaling_factor": 2.5,
  "scoring_func": "sigmoid",
  "tie_word_embeddings": false,
  "topk_group": 4,
  "topk_method": "noaux_tc",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "v_head_dim": 128,
  "vocab_size": 129280
}
```