"cacheflow/core/block_manager.py" did not exist on "a2a9869cb7e11a46c215e0cd55401509395c035d"
vllm_serve.sh 808 Bytes
Newer Older
sunzhq2's avatar
init  
sunzhq2 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Exports the runtime environment for this deployment, then launches
# `vllm serve` for the local qwen3-8B checkpoint on 0.0.0.0:8000.

# NUMA-related settings: VLLM_NUMA_BIND plus a node value of 0 for ranks 0-3.
# NOTE(review): four ranks are configured here although the server below runs
# with --tensor-parallel-size 2 on two devices; confirm ranks 2/3 are needed.
export VLLM_NUMA_BIND=1 \
  VLLM_RANK0_NUMA=0 \
  VLLM_RANK1_NUMA=0 \
  VLLM_RANK2_NUMA=0 \
  VLLM_RANK3_NUMA=0

# Interconnect / collective-communication settings. The channel count is
# fixed by setting the minimum and the maximum to the same value (16).
export HSA_FORCE_FINE_GRAIN_PCIE=1 \
  NCCL_MIN_NCHANNELS=16 \
  NCCL_MAX_NCHANNELS=16 \
  NCCL_P2P_LEVEL=SYS \
  NCCL_LAUNCH_MODE=GROUP \
  ALLREDUCE_STREAM_WITH_COMPUTE=1

# NOTE(review): presumably a timeout in milliseconds (30 minutes); confirm
# the unit against the vLLM build in use.
export VLLM_RPC_TIMEOUT=1800000

export VLLM_ZERO_OVERHEAD=1 VLLM_ZERO_OPT_ZEROS=1

# Environment variables required for testing Qwen3-30B-A3B
export VLLM_USE_FUSED_RMS_ROPE=1 VLLM_USE_MARLIN_W16A16_MOE=1

# Environment variables required for testing Qwen3-Next
export VLLM_USE_NN=0 TRITON_MOVE_LOAD_TOFRONT_DOT=0

# Two device ids are listed, matching --tensor-parallel-size 2 below.
export HIP_VISIBLE_DEVICES=6,7

# Serve the checkpoint under the public model name "qwen3-8B".
vllm serve /data2/models/qwen3-8B \
  --served-model-name qwen3-8B \
  --host 0.0.0.0 --port 8000 \
  --trust-remote-code \
  --tensor-parallel-size 2