#!/bin/bash
## Default vLLM 0.9.2 environment variables provided by the product team,
## used for testing non-DeepSeek-671B models.
##
## Usage: launcher.sh <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>
##   model_name  name reported by --served-model-name
##   model_path  local path to the model weights
##   tp          tensor-parallel size
##   dtype       float16 | bfloat16 | gptq-int8 | gptq-int4 | compressed-tensors-w8a8 | awq
##   host/port   bind address of the OpenAI-compatible API server
##   dst_path    directory that receives the launcher log file

export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1  # required on K100-AI

## NUMA core binding according to the hardware topology.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=0
export VLLM_RANK2_NUMA=0
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=0
export VLLM_RANK5_NUMA=0
export VLLM_RANK6_NUMA=0
export VLLM_RANK7_NUMA=0

## Variables reported from field deployments that may improve performance;
## enable selectively depending on the actual environment.
# export HSA_FORCE_FINE_GRAIN_PCIE=1
# export NCCL_P2P_LEVEL=SYS
# export NCCL_LAUNCH_MODE=GROUP
# export VLLM_RPC_TIMEOUT=1800000
# export VLLM_SPEC_DECODE_EAGER=1
# export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44  # set to 44 on K100AI; remove this variable on BW1000
# export VLLM_MLA_DISABLE=0
# export VLLM_USE_FLASH_MLA=1
# export VLLM_ZERO_OVERHEAD=1  # clearly helps some models but misbehaves in some environments — use with care; also disables the "thinking" feature of Qwen3 models
# export W8A8_SUPPORT_METHODS=3  # improves W8A8-quantized models
# export ROCBLAS_INT8_ENABLE=0   # improves W8A8-quantized models
# export VLLM_USE_FLASH_ATTN_PA=0  # fixes accuracy/garbled-output issues

# Disable core dump generation.
ulimit -c 0

# Validate the argument count early so a bad invocation fails loudly instead
# of producing a broken log path like "/launcher__tp__dtype__....log".
if [ $# -lt 7 ]; then
  echo "Usage: $0 <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>" >&2
  exit 2
fi

# Model parameters (positional).
model_name=$1
model_path=$2
tp=$3
dtype=$4
host=$5
port=$6
dst_path=$7

# Build the dtype/quantization-specific flags as an array so that multi-word
# values survive without relying on unquoted word-splitting.
extra=()
case "$dtype" in
  "float16" | "bfloat16")
    extra=(--dtype "$dtype")
    ;;
  "gptq-int8" | "gptq-int4")
    export GPTQ_CK_GEMMBS=15000
    extra=(--quantization gptq)
    ;;
  "compressed-tensors-w8a8")
    extra=(--quantization compressed-tensors)
    ;;
  "awq")
    export AWQ_CK_GEMMBS=15000
    extra=(--quantization awq)
    ;;
  *)
    # Previously an unknown dtype was silently ignored; keep launching with no
    # extra flags (same behavior) but warn so misconfiguration is visible.
    echo "WARN: unknown dtype '$dtype'; launching without dtype/quantization flags" >&2
    ;;
esac

# vLLM 0.9.2 defaults to the V1 engine with prefix caching enabled (recommended).
# To test alternatives:
#   use the V0 engine:            VLLM_USE_V1=0
#   toggle prefix caching:        --enable-chunked-prefill / --no-enable-prefix-caching
date=$(date +"%Y-%m-%d-%H-%M-%S")

# All expansions are quoted so model names/paths containing spaces work;
# "${extra[@]}" expands to zero words when empty.
python3 -m vllm.entrypoints.openai.api_server --served-model-name "$model_name" --gpu-memory-utilization 0.95 "${extra[@]}" \
  --model "$model_path" --tensor-parallel-size "$tp" --host "$host" --port "$port" --max-model-len 32768 \
  2>&1 | tee "${dst_path}/launcher_${model_name}_tp_${tp}_dtype_${dtype}_${date}.log"