#!/bin/bash
export DISABLE_VERSION_CHECK=1

# GPU visibility (ROCm): 8 GPUs per node by default
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export HF_ENDPOINT=https://hf-mirror.com

# Rendezvous: hostname or IP address of the actual master node, plus a free port
export MASTER_ADDR=XXXXXX
export MASTER_PORT=29569
export RANK=$1  # node rank, passed as the first argument to this script

# NCCL settings: set NCCL_SOCKET_IFNAME to the actual IB interface name (check with ifconfig)
export NCCL_SOCKET_IFNAME=ibxxxxx
export NCCL_DEBUG=INFO
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_MAX_P2P_NCHANNELS=32
export NCCL_NCHANNELS_PER_PEER=32
export NCCL_IB_TIMEOUT=30

# vLLM settings for rollout generation
export VLLM_RPC_TIMEOUT=1800000
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1

## nnodes is the number of machines; set it to your actual count, e.g. 2 for two machines
torchrun --nproc_per_node=8 \
    --nnodes=xxxx \
    --node_rank=${RANK} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
    src/train.py \
    --deepspeed examples/deepspeed/ds_z3_config.json \
    --stage grpo \
    --do_train \
    --finetuning_type freeze \
    --freeze_trainable_layers 5 \
    --freeze_trainable_modules all \
    --model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Llama-70B \
    --dataset dapo_math,hiyouga-math12k \
    --max_samples 20000 \
    --template deepseekr1 \
    --output_dir saves/DeepSeek-R1-Distill-Llama-70B-0923/grpo/full/ \
    --overwrite_output_dir \
    --trust_remote_code \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --weight_decay 0.1 \
    --repetition_penalty 1.05 \
    --top_k 50 \
    --top_p 0.8 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --preprocessing_num_workers 16 \
    --ddp_timeout 120000000 \
    --learning_rate 5e-3 \
    --lr_scheduler_type cosine \
    --logging_steps 1 \
    --cutoff_len 2048 \
    --save_steps 100 \
    --plot_loss True \
    --num_train_epochs 1 \
    --bf16 \
    --seed 42 \
    --report_to none \
    --save_only_model
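
## Usage sketch (the filename below is illustrative): run this script on every
## node, passing that node's rank as the first argument. With --nnodes=2, for example:
##   bash train_grpo_multinode.sh 0   # on the master node (the MASTER_ADDR host)
##   bash train_grpo_multinode.sh 1   # on the second node
## To find the IB interface name for NCCL_SOCKET_IFNAME, list interfaces first,
## e.g. with `ifconfig` or `ip -br link`.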