#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible server for a bf16 model using the Ray
# distributed executor, with NCCL/RCCL channel tuning and per-rank NUMA
# binding on an 8-GPU (ROCm) host.
#
# NOTE(review): the original file had all statements collapsed onto one
# physical line, which made everything after the first `export` be parsed
# as export operands and aborted on `/workspace/...` (not a valid
# identifier). Restored one statement per line so the script executes.
set -euo pipefail

# GPUs visible to this process: 8 local devices.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# ROCm: force fine-grained PCIe memory for host<->device transfers.
export HSA_FORCE_FINE_GRAIN_PCIE=1

# Pin NCCL/RCCL to exactly 16 channels, route P2P at system level, and use
# grouped kernel launches.
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export NCCL_P2P_LEVEL=SYS
export NCCL_LAUNCH_MODE=GROUP

# Disable the fp16r compute-type path in rocBLAS.
export ROCBLAS_COMPUTETYPE_FP16R=0

# Make PyTorch's bundled shared libraries resolvable at load time.
# ${LD_LIBRARY_PATH:-} keeps this safe under `set -u` when the variable
# is not already set in the environment.
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:${LD_LIBRARY_PATH:-}

# Bind each local vLLM rank to a specific NUMA node.
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=3
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=1
export VLLM_RANK3_NUMA=0
export VLLM_RANK4_NUMA=7
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=5
export VLLM_RANK7_NUMA=4

# NOTE(review): -tp 32 exceeds the 8 locally visible GPUs. With the Ray
# backend this implies a multi-node (e.g. 4 x 8-GPU) deployment — confirm
# the Ray cluster provides the remaining ranks, or reduce to -tp 8 for
# single-node use.
vllm serve /workspace/llms/bf16_model/ \
  --trust-remote-code \
  --distributed-executor-backend ray \
  --dtype bfloat16 \
  --max-model-len 24000 \
  --max-seq-len-to-capture 24000 \
  -tp 32 \
  --gpu-memory-utilization 0.9 \
  --max-num-seqs 128 \
  --speculative_config '{"num_speculative_tokens": 1}' \
  --block-size 64 \
  --disable-log-requests