# GRPO Training Repository Based on llama-factory
## Environment Setup
### Docker (Method 1)
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro {imageID} bash
cd /your_code_path/llama-grpo
pip install -e .
pip uninstall -y trl  # remove the preinstalled trl; v0.19.0 is built from source below
cd ../
git clone -b v0.19.0 https://github.com/huggingface/trl.git
mv trl trl-v0.19.0
cd trl-v0.19.0
pip install -e .
cd ../llama-grpo
pip install transformers==4.51.3
bash train.sh x  # x = node rank: 0 on the first server, 1 on the second, ...
```
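To sanity-check that the DCUs are visible inside the container before training, a quick check (assuming the DAS build of PyTorch exposes DCU devices through the standard `torch.cuda` API, as ROCm builds do):
```bash
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
```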
### Dockerfile (Method 2)
```bash
cd docker
docker build --no-cache -t llama-grpo:latest .
docker run -it --shm-size 200g --network=host --name {docker_name} --privileged --device=/dev/kfd --device=/dev/dri --device=/dev/mkfd --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal/:/opt/hyhal/:ro llama-grpo:latest bash
cd /your_code_path/llama-grpo
pip install -e .
pip uninstall -y trl  # remove the preinstalled trl; v0.19.0 is built from source below
cd ../
git clone -b v0.19.0 https://github.com/huggingface/trl.git
mv trl trl-v0.19.0
cd trl-v0.19.0
pip install -e .
cd ../llama-grpo
pip install transformers==4.51.3
bash train.sh x  # x = node rank: 0 on the first server, 1 on the second, ...
```
### Anaconda (Method 3)
The DCU-specific deep learning libraries required by this project can be downloaded from the [光合](https://developer.sourcefind.cn/tool/) developer community. The versions used are listed below, followed by a setup sketch.
```bash
DTK: 25.04.1
python: 3.10.12
torch: 2.4.1+das.opt1.dtk25041
vllm: 0.8.5
transformers: 4.51.3
deepspeed: 0.14.2+das.opt1.dtk25041
```
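A minimal setup sketch under the versions above, assuming the DCU-specific wheels (torch, deepspeed, vllm) have already been downloaded from the 光合 community; the wheel file names here are illustrative:
```bash
conda create -n grpo python=3.10.12 -y
conda activate grpo
# DCU wheels from developer.sourcefind.cn (file names are illustrative)
pip install torch-2.4.1+das.opt1.dtk25041-cp310-*.whl
pip install deepspeed-0.14.2+das.opt1.dtk25041-cp310-*.whl
pip install vllm-0.8.5-cp310-*.whl
pip install transformers==4.51.3
# this repo and trl v0.19.0 from source, as in the Docker methods
cd /your_code_path/llama-grpo && pip install -e .
cd .. && git clone -b v0.19.0 https://github.com/huggingface/trl.git && pip install -e ./trl
```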
## Training
### Multi-node Launch
1. Start `trl vllm-serve`
```bash
bash start_vllm_serve.sh
```
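Once the server is running, a quick reachability check from a training node (this assumes the `/health/` endpoint exposed by TRL's vLLM server in v0.19, and the default port 8001 used in `start_vllm_serve.sh`):
```bash
curl http://{vllm_server_host}:8001/health/   # expect HTTP 200 once the server is ready
```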
2. Start training
Run the script below on each server separately, with x=0 on the first server, x=1 on the second, and so on (see the example after the code block).
Configure the remaining parameters according to the comments inside the script.
```bash
bash train.sh x
```
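For example, with two servers (the argument is consumed as `$1` by `train.sh` and used as the torchrun node rank):
```bash
bash train.sh 0   # on the first server (master)
bash train.sh 1   # on the second server
```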
### Slurm Launch
1. Start vllm-serve
```bash
sbatch sbatch_vllm.sh
```
2. Start training
Edit the parameters inside `sbatch_train.sh`, then submit the job; monitoring commands follow the snippet below.
```bash
sbatch sbatch_train.sh
```
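After submission, job state and training output can be monitored with standard Slurm tools:
```bash
squeue -u $USER                      # job state and allocated nodes
tail -f logs/grpo_train_<jobid>.out  # live log (file name set by #SBATCH --output)
```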
## Known Issues
If `cannot re` occurs
{
"dapo_math": {
"hf_hub_url": "open-r1/DAPO-Math-17k-Processed",
"split": "train",
"subset": "all",
"columns": {
"prompt": "prompt",
"response": "solution"
}
},
"OpenMathReasoning-mini": {
"hf_hub_url": "unsloth/OpenMathReasoning-mini",
"split": "train",
"columns": {
"prompt": "problem",
"response": "expected_answer"
}
},
"hiyouga-math12k": {
"hf_hub_url": "hiyouga/math12k",
"split": "train",
"columns": {
"prompt": "problem",
"response": "answer"
}
},
"identity": {
"file_name": "identity.json"
},
......
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250724
#!/bin/bash
#SBATCH --job-name=grpo_train            # job name
#SBATCH --output=logs/grpo_train_%j.out  # stdout log file
#SBATCH --error=logs/grpo_train_%j.out   # stderr log file
#SBATCH --nodes=2                        # number of nodes
#SBATCH --qos=dcudvp
#SBATCH --gres=dcu:8                     # 8 DCUs per node
#SBATCH --cpus-per-task=32               # 32 CPUs per task
#SBATCH --partition=dcu                  # DCU partition (check with sinfo)
#SBATCH --ntasks-per-node=1
#SBATCH --mem=960G
NODE_LIST=($(scontrol show hostnames $SLURM_JOB_NODELIST))
for RANK in "${!NODE_LIST[@]}"; do
node="${NODE_LIST[$RANK]}"
srun --nodes=1 --exclusive -w $node bash <<EOF &
source ~/packages/dtk-25.04.1/env.sh
source ~/miniconda3/etc/profile.d/conda.sh
conda activate grpo
export DISABLE_VERSION_CHECK=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # 8 DCUs per node by default
export HSA_FORCE_FINE_GRAIN_PCIE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export HF_ENDPOINT=https://hf-mirror.com
export MASTER_ADDR=${NODE_LIST[0]} # hostname or IP of the master node (first allocated node)
export RANK=$RANK
export MASTER_PORT=29568
export WORLD_SIZE=$((8 * ${#NODE_LIST[@]}))
export NCCL_SOCKET_IFNAME=ibxxxxx # actual IB interface name (check with ifconfig)
export NCCL_DEBUG=INFO
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_MAX_P2P_NCHANNELS=32
export NCCL_NCHANNELS_PER_PEER=32
export VLLM_RPC_TIMEOUT=1800000
export NCCL_IB_TIMEOUT=30
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
echo "Allocated nodes: $SLURM_NODELIST"
echo "Master address: \$MASTER_ADDR"
echo "Master port: \$MASTER_PORT"
echo "World size: \$WORLD_SIZE"
sleep \$((RANK*3)) # stagger start-up by node rank
DISTRIBUTED_ARGS="
--nproc_per_node=8 \
--nnodes=\$SLURM_JOB_NUM_NODES \
--node-rank=\${RANK} \
--master_addr=\${MASTER_ADDR} \
--master_port=\${MASTER_PORT}
"
torchrun \$DISTRIBUTED_ARGS /path/of/llama-factory/src/train.py \
--deepspeed /path/of/deepspeed/ds_z3_config.json \
--stage grpo \
--do_train \
--finetuning_type freeze \
--freeze_trainable_layers 5 \
--freeze_trainable_modules all \
--model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Llama-70B \
--dataset_dir /path/of/llama-factory-0923/data/ \
--dataset dapo_math,hiyouga-math12k \
--max_samples 20000 \
--template deepseekr1 \
--output_dir /path/of/saves/DeepSeek-R1-Distill-Llama-70B-0923/grpo/full/ \
--overwrite_output_dir \
--trust_remote_code \
--warmup_ratio 0.1 \
--max_grad_norm 1.0 \
--weight_decay 0.1 \
--repetition_penalty 50 \
--top_k 50 \
--top_p 0.8 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--preprocessing_num_workers 16 \
--ddp_timeout 120000000 \
--learning_rate 5e-3 \
--lr_scheduler_type cosine \
--optim paged_adamw_32bit \
--logging_steps 1 \
--cutoff_len 8192 \
--save_steps 100 \
--plot_loss True \
--num_train_epochs 1 \
--bf16 \
--seed 42 \
--report_to none \
--save_only_model
EOF
done
wait
#!/bin/bash
#SBATCH --job-name=grpo_vllm             # job name
#SBATCH --output=logs/grpo_vllm_%j.out   # stdout log file
#SBATCH --error=logs/grpo_vllm_%j.out    # stderr log file
#SBATCH --nodes=1                        # number of nodes
#SBATCH --qos=dcudvp
#SBATCH --gres=dcu:8                     # 8 DCUs per node
#SBATCH --cpus-per-task=32               # 32 CPUs per task
#SBATCH --partition=dcu                  # DCU partition (check with sinfo)
#SBATCH --ntasks-per-node=1
#SBATCH --mem=480G
#SBATCH --nodelist=xxxxxx                # pin to a specific node
source ~/packages/dtk-25.04.1/env.sh
source ~/miniconda3/etc/profile.d/conda.sh
conda activate grpo
export DISABLE_VERSION_CHECK=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_DEBUG=INFO
export NCCL_P2P_LEVEL=SYS
export NCCL_IB_DISABLE=1
export VLLM_RPC_TIMEOUT=1800000
export NCCL_IB_TIMEOUT=30
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
export NCCL_SOCKET_IFNAME=ibxxxx # actual IB interface name (check with ifconfig)
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_MAX_P2P_NCHANNELS=32
export NCCL_NCHANNELS_PER_PEER=32
trl vllm-serve --model deepseek-ai/DeepSeek-R1-Distill-Llama-70B --tensor-parallel-size 8 --gpu_memory_utilization 0.8 --port 8001
#!/bin/bash
export DISABLE_VERSION_CHECK=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HSA_FORCE_FINE_GRAIN_PCIE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export LLAMA_NN=0
export HF_ENDPOINT=https://hf-mirror.com
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
export NCCL_SOCKET_IFNAME=ibp58s0
export NCCL_DEBUG=INFO
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_MAX_P2P_NCHANNELS=32
export NCCL_NCHANNELS_PER_PEER=32
export VLLM_RPC_TIMEOUT=1800000
export NCCL_IB_TIMEOUT=30
# export VLLM_WORKER_MULTIPROC_METHOD="spawn"
trl vllm-serve --model llama3/Meta-Llama-3-70B-Instruct --tensor-parallel-size 8 --gpu_memory_utilization 0.8 --port 8001
#!/bin/bash
export DISABLE_VERSION_CHECK=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # 8 DCUs per node by default
export HSA_FORCE_FINE_GRAIN_PCIE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export HF_ENDPOINT=https://hf-mirror.com
export MASTER_ADDR=XXXXXX # hostname or IP of the actual master node
export MASTER_PORT=29569
export RANK=$1 # node rank passed as the first argument (see README)
export NCCL_SOCKET_IFNAME=ibxxxxx # actual IB interface name (check with ifconfig)
export NCCL_DEBUG=INFO
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
export NCCL_MAX_P2P_NCHANNELS=32
export NCCL_NCHANNELS_PER_PEER=32
export VLLM_RPC_TIMEOUT=1800000
export NCCL_IB_TIMEOUT=30
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
## nnodes is the number of machines; set it to the actual count, e.g. 2 for two machines
torchrun --nproc_per_node=8 \
    --nnodes=xxxx \
    --node-rank=${RANK} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
src/train.py \
--deepspeed ~/GRPO/deepspeed/ds_z3_config.json \
--stage grpo \
--do_train \
--finetuning_type freeze \
--freeze_trainable_layers 5 \
--freeze_trainable_modules all \
--model_name_or_path deepseek-ai/DeepSeek-R1-Distill-Llama-70B \
--dataset dapo_math,hiyouga-math12k \
--max_samples 20000 \
--template deepseekr1 \
--output_dir saves/DeepSeek-R1-Distill-Llama-70B-0923/grpo/full/ \
--overwrite_output_dir \
--trust_remote_code \
--warmup_ratio 0.1 \
--max_grad_norm 1.0 \
--weight_decay 0.1 \
--repetition_penalty 50 \
--top_k 50 \
--top_p 0.8 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--preprocessing_num_workers 16 \
--ddp_timeout 120000000 \
--learning_rate 5e-3 \
--lr_scheduler_type cosine \
--logging_steps 1 \
--cutoff_len 8192 \
--save_steps 100 \
--plot_loss True \
--num_train_epochs 1 \
--bf16 \
--seed 42 \
--report_to none \
--save_only_model