#!/bin/bash

## Default environment variables for vLLM 0.9.2 provided by the product team, used for testing non-DS-671B models
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
# export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1   # Required on K100-AI

## NUMA core binding according to the hardware topology
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=0
export VLLM_RANK2_NUMA=0
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=0
export VLLM_RANK5_NUMA=0
export VLLM_RANK6_NUMA=0
export VLLM_RANK7_NUMA=0
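
## One way to inspect the topology before filling in the per-rank bindings above
## (a sketch only; availability of these commands on the host is an assumption):
# numactl --hardware    # list NUMA nodes and the CPUs attached to each
# rocm-smi --showtopo   # show device topology/affinity on ROCm-compatible stacks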

## Environment variables reported by field projects as potentially improving performance; enable as appropriate
# export HSA_FORCE_FINE_GRAIN_PCIE=1
# export NCCL_P2P_LEVEL=SYS
# export NCCL_LAUNCH_MODE=GROUP
# export VLLM_RPC_TIMEOUT=1800000
# export VLLM_SPEC_DECODE_EAGER=1
# export VLLM_ENFORCE_EAGER_BS_THRESHOLD=44   # Set to 44 on K100-AI; remove this variable on BW1000

# export VLLM_MLA_DISABLE=0
# export VLLM_USE_FLASH_MLA=1
# export VLLM_ZERO_OVERHEAD=1     # Noticeably improves performance for some models, but behaves abnormally in some environments, so enable with discretion; also disables the thinking feature of Qwen3 models

# export W8A8_SUPPORT_METHODS=3 # Improves performance for W8A8 quantized models
# export ROCBLAS_INT8_ENABLE=0 # Improves performance for W8A8 quantized models

# export VLLM_USE_FLASH_ATTN_PA=0 # Works around accuracy issues that produce garbled output

# Disable core dump generation
ulimit -c 0

# Model parameters (positional script arguments)
model_name=$1
model_path=$2
tp=$3
dtype=$4
host=$5
port=$6
dst_path=$7
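
# Minimal argument sanity check (a sketch added for robustness; the original
# flow assumes all seven positional arguments are always supplied):
if [ $# -lt 7 ]; then
    echo "Usage: $0 <model_name> <model_path> <tp> <dtype> <host> <port> <dst_path>" >&2
    exit 1
fi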
case $dtype in
    "float16" | "bfloat16")
        extra="--dtype ${dtype}" ;;
    "gptq-int8" | "gptq-int4")
        export GPTQ_CK_GEMMBS=15000
        extra="--quantization gptq" ;;
    "compressed-tensors-w8a8")
        extra="--quantization compressed-tensors" ;;
    "awq")
        export AWQ_CK_GEMMBS=15000
        extra="--quantization awq" ;;
    *)
        # Unrecognized dtype: pass no extra flags and let vLLM use its defaults
        extra="" ;;
esac

# vLLM 0.9.2 defaults to the V1 Engine with Prefix Caching enabled (recommended); for testing you can switch to the V0 Engine or disable Prefix Caching
# Use the V0 Engine: VLLM_USE_V1=0
# Enable/disable Prefix Caching: --enable-prefix-caching / --no-enable-prefix-caching
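# Applied to this script, the toggles would look like this (shown disabled;
# VLLM_USE_V1 and the flag names come from the notes above):
# export VLLM_USE_V1=0
# extra="${extra} --no-enable-prefix-caching"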

date=$(date +"%Y-%m-%d-%H-%M-%S")
# Note: $extra is intentionally left unquoted so it word-splits into separate flags
python3 -m vllm.entrypoints.openai.api_server --served-model-name "$model_name" --gpu-memory-utilization 0.95 $extra \
    --model "$model_path" --tensor-parallel-size "$tp" --host "$host" --port "$port" --max-model-len 32768 \
    2>&1 | tee "${dst_path}/launcher_${model_name}_tp_${tp}_dtype_${dtype}_${date}.log"
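
# Example invocation (model name and paths are hypothetical):
# bash run_apiserver.sh Qwen2.5-7B-Instruct /data/models/Qwen2.5-7B-Instruct 8 bfloat16 0.0.0.0 8000 ./logs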