Commit 024e6d37 authored by chenych

Fix bugs and complete vllm serve

parent 590059ff
......@@ -21,16 +21,10 @@ DCU model: K100AI; nodes: 4; DCU cards: 32.
### Docker (Method 1)
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
```
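Once inside the container, a quick sanity check (a minimal sketch, assuming the image's PyTorch is the DTK/ROCm build, where DCUs are exposed through the `torch.cuda` API) confirms the cards are visible:
```bash
# Should print True and the number of visible DCUs (8 per node here)
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
```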
### Dockerfile (Method 2)
......@@ -40,13 +34,7 @@ docker build --no-cache -t deepseek-v3.2-exp:latest .
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
```
### Anaconda (Method 3)
......@@ -55,16 +43,12 @@ cd /your_code_path/deepseek-v3.2-exp_pytorch
DTK: 25.04.1
python: 3.10.12
torch: 2.5.1+das.opt1.dtk25041
```
`Tips: the DTK driver, PyTorch, and the other DCU-related components above must use exactly matching versions`; install the remaining packages as follows:
```bash
# Download the DCU vllm wheel
wget http://10.16.4.1:8000/customized/vllm/dtk25.04.1/v0.9.2-dsv32/v0.9.2-dsv32-cf360956/vllm-0.9.2%2Bdas.opt1.rc2.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
pip install vllm-0.9.2+das.opt1.rc2.dtk25041-cp310-cp310-manylinux_2_28_x86_64.whl
```
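A quick way to confirm the wheel installed correctly (a minimal check, not part of the original steps):
```bash
# Expect the DAS build string, e.g. 0.9.2+das.opt1.rc2.dtk25041
python -c "import vllm; print(vllm.__version__)"
```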
## Dataset
......@@ -76,16 +60,14 @@ cd /your_code_path/deepseek-v3.2-exp_pytorch
## Inference
Example model: [DeepSeek-V3.2-Exp](https://huggingface.co/deepseek-ai/DeepSeek-V3.2-Exp)
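If the weights are not yet present locally, one way to fetch them is the Hugging Face CLI (a sketch; the destination path is a placeholder and the fp8 checkpoint needs substantial disk space):
```bash
pip install -U "huggingface_hub[cli]"
# Download the fp8 checkpoint to a local directory
huggingface-cli download deepseek-ai/DeepSeek-V3.2-Exp --local-dir /path/to/DeepSeek-V3.2-Exp
```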
First convert the model to bf16 format; the conversion commands are as follows:
```bash
# Convert fp8 to bf16
python inference/fp8_cast_bf16.py --input-fp8-hf-path /path/to/DeepSeek-V3.2-Exp --output-bf16-hf-path /path/to/DeepSeek-V3.2-Exp-bf16
# Copy the config file
cp inference/config.json /path/to/DeepSeek-V3.2-Exp-bf16
```
After the conversion finishes, copy `generation_config.json`, `tokenizer_config.json`, and `tokenizer.json` from the original model directory into `/path/to/DeepSeek-V3.2-Exp-bf16`, as sketched below.
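A sketch of that copy step, assuming the original checkpoint sits at `/path/to/DeepSeek-V3.2-Exp`:
```bash
SRC=/path/to/DeepSeek-V3.2-Exp
DST=/path/to/DeepSeek-V3.2-Exp-bf16
# Copy the tokenizer and generation config alongside the converted weights
for f in generation_config.json tokenizer_config.json tokenizer.json; do
  cp "$SRC/$f" "$DST/"
done
```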
### vLLM inference
#### Multi-node server
......@@ -108,16 +90,26 @@ python inference/fp8_cast_bf16.py --input-fp8-hf-path /path/to/DeepSeek-V3.2-Exp
```bash
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_HOST_IP=x.x.x.x # IP of this compute node; prefer the IP bound to the IB interface (SOCKET_IFNAME)
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=ibxxxx
export GLOO_SOCKET_IFNAME=ibxxxx
export NCCL_IB_HCA=mlx5_0:1 # name of the IB HCA in this environment
unset NCCL_ALGO
export NCCL_IB_DISABLE=0
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export NCCL_NET_GDR_READ=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_DEBUG=INFO
export NCCL_MIN_P2P_NCHANNELS=16
export NCCL_NCHANNELS_PER_PEER=16
export HIP_USE_GRAPH_QUEUE_POOL=1
export VLLM_ENABLE_MOE_FUSED_GATE=0
export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44
export VLLM_RPC_TIMEOUT=1800000
export VLLM_USE_FLASH_MLA=1
# Bind cores on Hygon CPUs; can be omitted on Intel CPUs
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
......@@ -127,10 +119,10 @@ export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
# Additional environment variables required on BW clusters
export NCCL_NET_GDR_LEVEL=7
export NCCL_SDMA_COPY_ENABLE=0
```
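Every node in the cluster needs the same settings, so one convenient pattern (a suggestion, not part of the original guide; `env_dcu.sh` is a hypothetical file name) is to collect the exports in a script, adjust the per-node values, and source it before starting Ray:
```bash
# Run on every node; VLLM_HOST_IP and the *_SOCKET_IFNAME values differ per node
source /path/your_code_data/env_dcu.sh
env | grep -E 'NCCL|VLLM|HIP' | sort  # verify the settings took effect
```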
2. Start the Ray cluster
......@@ -147,43 +139,40 @@ ray start --address='x.x.x.x:6379' --num-gpus=8 --num-cpus=32
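For reference, a minimal sketch of bringing up the 4-node cluster; the worker join command is the one shown above, while the head-node command is an assumption based on the port the workers join:
```bash
# On the head node (x.x.x.x)
ray start --head --port=6379 --num-gpus=8 --num-cpus=32
# On each of the three worker nodes
ray start --address='x.x.x.x:6379' --num-gpus=8 --num-cpus=32
# Confirm all 4 nodes / 32 DCUs have registered
ray status
```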
> On Intel CPUs, add the flag `--enforce-eager`.
```bash
vllm serve deepseek-ai/DeepSeek-V3.2-Exp \
--trust-remote-code \
--distributed-executor-backend ray \
--dtype bfloat16 \
--tensor-parallel-size 32 \
--max-num-seqs 128 \
--max-model-len 32768 \
--no-enable-chunked-prefill \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.85 \
--host 12.12.12.11 \
--port 8001
```
Once the server is up, you can query it as follows:
```bash
curl http://127.0.0.1:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-V3.2-Exp",
"messages": [
{
"role": "user",
"content": "Explain Machine Learning to me in a nutshell."
"content": "请介绍下你自己。"
}
],
"temperature": 0.15,
"top_p": 1.0,
"max_tokens": 2048,
"stream": false
}'
"max_tokens": 1024,
"temperature": 0.7,
"chat_template_kwargs": {
"thinking": false
}
}'
```
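The server also exposes the standard OpenAI-compatible model listing, which makes a convenient readiness check:
```bash
# Should return the served model name once the server is up
curl http://127.0.0.1:8001/v1/models
```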
## Result
<div align=center>
<img src="./doc/results_dcu.jpg"/>
<img src="./doc/results_dcu.png"/>
</div>
### Accuracy
......

Dockerfile:

FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.9.2-ubuntu22.04-dtk25.04.1-rc5-rocblas104381-0915-das1.6-py3.10-20250916-rc2-ds3.2

inference/config.json:
{
"architectures": [
"DeepseekV3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 0,
"eos_token_id": 1,
"ep_size": 1,
"first_k_dense_replace": 3,
"hidden_act": "silu",
"hidden_size": 7168,
"index_head_dim": 128,
"index_n_heads": 64,
"index_topk": 2048,
"initializer_range": 0.02,
"intermediate_size": 18432,
"kv_lora_rank": 512,
"max_position_embeddings": 163840,
"model_type": "deepseek_v3",
"moe_intermediate_size": 2048,
"moe_layer_freq": 1,
"n_group": 8,
"n_routed_experts": 256,
"n_shared_experts": 1,
"norm_topk_prob": true,
"num_attention_heads": 128,
"num_experts_per_tok": 8,
"num_hidden_layers": 61,
"num_key_value_heads": 128,
"num_nextn_predict_layers": 1,
"q_lora_rank": 1536,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"beta_fast": 32,
"beta_slow": 1,
"factor": 40,
"mscale": 1.0,
"mscale_all_dim": 1.0,
"original_max_position_embeddings": 4096,
"type": "yarn"
},
"rope_theta": 10000,
"routed_scaling_factor": 2.5,
"scoring_func": "sigmoid",
"tie_word_embeddings": false,
"topk_group": 4,
"topk_method": "noaux_tc",
"torch_dtype": "bfloat16",
"transformers_version": "4.44.2",
"use_cache": true,
"v_head_dim": 128,
"vocab_size": 129280
}