#!/bin/bash
#
# Launch a vLLM OpenAI-compatible server with NUMA binding and NCCL tuning.
#
# Usage:
#   launch.sh <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>
#
#   model_name  served model name exposed by the API
#   model_path  local path (or HF id) of the model weights
#   tp          tensor-parallel size
#   dtype       float16|bfloat16|gptq-int8|gptq-int4|float8|w8a8|awq
#   host/port   bind address for the server
#   dst_path    directory that receives the launcher log
#
# Exit on unhandled errors / unset variables; 'pipefail' makes the final
# 'vllm serve ... | tee' pipeline propagate vllm's exit status.
set -euo pipefail

# NUMA binding per the BW1000 topology survey results.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=3
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=1
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=7
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=5
export VLLM_RANK7_NUMA=4

export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export NCCL_P2P_LEVEL=SYS
export NCCL_LAUNCH_MODE=GROUP
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_RPC_TIMEOUT=1800000
export VLLM_ZERO_OVERHEAD=1
export VLLM_ZERO_OPT_ZEROS=1

# Environment variables required for testing Qwen3-30B-A3B.
export VLLM_USE_FUSED_RMS_ROPE=1
export VLLM_USE_MARLIN_W16A16_MOE=1

# Environment variables required for testing Qwen3-Next.
export VLLM_USE_NN=0
export TRITON_MOVE_LOAD_TOFRONT_DOT=0

# Disable core-file generation.
ulimit -c 0

die() { printf 'error: %s\n' "$*" >&2; exit 1; }

# Model parameters (all seven are required).
if [[ $# -ne 7 ]]; then
  printf 'Usage: %s <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>\n' \
    "${0##*/}" >&2
  exit 2
fi
model_name=$1
model_path=$2
tp=$3
dtype=$4
host=$5
port=$6
dst_path=$7

[[ -d "$dst_path" ]] || die "log directory does not exist: $dst_path"

# Build dtype/quantization flags as an array so each flag stays one word
# even under quoting (no reliance on unquoted word-splitting).
extra=()
case "$dtype" in
  "float16" | "bfloat16")
    extra=(--dtype "$dtype")
    ;;
  "gptq-int8" | "gptq-int4")
    export GPTQ_CK_GEMMBS=15000
    extra=(--quantization gptq)
    ;;
  "float8")
    extra=(--quantization fp8)
    ;;
  "w8a8")
    extra=(--quantization compressed-tensors)
    ;;
  "awq")
    export AWQ_CK_GEMMBS=15000
    extra=(--quantization awq)
    ;;
  *)
    # Previously an unknown dtype silently launched with no flags; fail fast instead.
    die "unsupported dtype: $dtype"
    ;;
esac

# vllm 0.11.0 removed the V0 engine; the V1 engine is used by default.
# vllm 0.11.0 enables Prefix Caching by default (recommended); toggle as needed:
#   --enable-prefix-caching / --no-enable-prefix-caching
timestamp=$(date +"%Y-%m-%d-%H-%M-%S")

vllm serve "$model_path" \
  --served-model-name "$model_name" \
  --tensor-parallel-size "$tp" \
  --host "$host" --port "$port" \
  --gpu-memory-utilization 0.95 "${extra[@]}" \
  --max-model-len 32768 \
  --trust-remote-code \
  --no-enable-prefix-caching \
  --enable-chunked-prefill \
  --disable-cascade-attn \
  --disable-log-stats \
  2>&1 | tee "${dst_path}/launcher_${model_name}_tp_${tp}_dtype_${dtype}_${timestamp}.log"