1d.sh 1.2 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Runtime environment for the `vllm serve` process started further down.
# The trailing notes are translated from the original author's comments; what
# each variable actually does is decided by the vLLM build that reads it, not
# by this file.
export ALLREDUCE_STREAM_WITH_COMPUTE=1 # author's note: "same stream" (presumably allreduce shares the compute stream; TODO confirm)

# IP address: the first field printed by `hostname -I`.
# The assignment is kept separate from `export` so the command substitution is
# not hidden behind export's own (always successful) exit status.
VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
export VLLM_HOST_IP
if [ -z "$VLLM_HOST_IP" ]; then
  # Best effort: warn but keep going, as the original silently exported "".
  printf '%s\n' "warning: VLLM_HOST_IP is empty; 'hostname -I' returned no address" >&2
fi

export VLLM_TORCH_PROFILER_DIR=/workspace # author's note: torch profiler output directory
export DEBUG_CLR_GRAPH_PACKET_CAPTURE=false
export VLLM_FUSED_MOE_CHUNK_SIZE=8192
export VLLM_USE_GLOBAL_CACHE13=1 # author's note: does not affect performance, helps avoid OOM
export VLLM_ENABLE_TBO=0 # author's note: default value
export VLLM_ZERO_OVERHEAD=1

# Minute-resolution timestamp (YYYYmmdd-HHMM) used in the log file name below.
current_time=$(date +"%Y%m%d-%H%M")

#export VLLM_P2P_ASYNC=1



# Start the vLLM OpenAI-compatible server for the W4A8 DeepSeek-R1 checkpoint
# and mirror its combined stdout/stderr into a timestamped log file.
#
# - HTTP endpoint: 0.0.0.0:20009. The "http_port" inside --kv-transfer-config
#   carries the same value; presumably the two must stay in sync (TODO confirm).
# - --kv-transfer-config sets kv_role to "kv_consumer" with P2pNcclConnector and
#   points at a proxy on 10.16.1.115:30007.
# - ${current_time} is set earlier in this script; if it is empty the log is
#   simply named "2d-.log".
# - NOTE(review): the log prefix is "2d-" although it lives in 1d_log/; this
#   looks like a copy-paste leftover but is kept so existing log paths and
#   tooling are unaffected.
#
# The log directory is created first: without it tee cannot open the file and
# the server output would only reach the terminal.
mkdir -p 1d_log

vllm serve /module3/DeepSeek-R1-0528-W4A8-V2 \
  --host 0.0.0.0 \
  --port 20009 \
  --trust-remote-code \
  --dtype bfloat16 \
  -q slimquant_w4a8_marlin \
  --kv-cache-dtype fp8_e5m2 \
  --max-model-len 49152 \
  -tp 8 \
  --gpu-memory-utilization 0.95 \
  --max-num-seqs 256 \
  --block-size 64 \
  --disable-log-requests \
  --max-num-batched-tokens 8192 \
  --no-enable-prefix-caching \
  --enable-chunked-prefill \
  --speculative_config '{"method": "deepseek_mtp", "num_speculative_tokens": 3}' \
  --kv-transfer-config '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"1e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.16.1.115","proxy_port":"30007","http_port":"20009","send_type":"PUT_ASYNC","mem_pool_size_gb":256}}' \
  2>&1 | tee "1d_log/2d-${current_time}.log"