#!/bin/bash
#SBATCH --job-name=grpo_vllm            # job name
#SBATCH --output=logs/grpo_vllm_%j.out  # stdout log file
#SBATCH --error=logs/grpo_vllm_%j.out   # stderr log file (merged with stdout)
#SBATCH --nodes=1                       # number of nodes
#SBATCH --qos=dcudvp
#SBATCH --gres=dcu:8                    # 8 DCUs per node
#SBATCH --cpus-per-task=32              # 32 CPU cores per task
#SBATCH --partition=dcu                 # DCU partition (list partitions with sinfo)
#SBATCH --ntasks-per-node=1
#SBATCH --mem=480G
#SBATCH --nodelist=xxxxxx               # pin to a specific node

# Load the DTK (DCU toolkit) runtime and activate the conda environment
source ~/packages/dtk-25.04.1/env.sh
source ~/miniconda3/etc/profile.d/conda.sh
conda activate grpo

export DISABLE_VERSION_CHECK=1

# DCU visibility, NCCL tuning, and vLLM options for single-node serving
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_DEBUG=INFO
export NCCL_P2P_LEVEL=SYS
export NCCL_IB_DISABLE=1
export VLLM_RPC_TIMEOUT=1800000
export NCCL_IB_TIMEOUT=30
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
export NCCL_SOCKET_IFNAME=ibxxxx        # use ifconfig to find the actual IB interface name
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_MAX_P2P_NCHANNELS=32
export NCCL_NCHANNELS_PER_PEER=32

# Start the TRL vLLM inference server with 8-way tensor parallelism on port 8001
trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor-parallel-size 8 --gpu_memory_utilization 0.8 --port 8001
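
# Usage sketch (run from a login node, not inside this script): the file name,
# job ID, and node below are placeholders, and the /health/ probe assumes a
# recent TRL release of trl vllm-serve; verify against your installed version.
#
#   mkdir -p logs                        # the log directory must exist before the first submit
#   sbatch grpo_vllm_serve.sbatch        # submit this script (hypothetical file name)
#   squeue -u "$USER"                    # check the job state and the node it landed on
#   tail -f logs/grpo_vllm_<jobid>.out   # follow the vLLM startup log
#   curl http://<node>:8001/health/      # probe the server once the model has loaded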