Commit 5468d41a authored by chenych's avatar chenych

Update README

parent b93a5a38
@@ -16,7 +16,6 @@ cd trl-v0.19.0
 pip install -e .
 cd ../llama-grpo
 pip install transformers==4.51.3
-bash train.sh x
 ```
 ### Dockerfile (Method 2)
@@ -36,7 +35,6 @@ cd trl-v0.19.0
 pip install -e .
 cd ../llama-grpo
 pip install transformers==4.51.3
-bash train.sh x
 ```
 ### Anaconda (Method 3)
@@ -76,7 +74,18 @@ sbatch sbatch_vllm.sh
 sbatch sbatch_train.sh
 ```
-## Known issues
-If `cannot re` appears
\ No newline at end of file
+## Known issues
+1. If you see the error `RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method`,
+fix it by editing `trl-v0.19.0/trl/scripts/vllm_serve.py`:
+```python
+# os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  ## comment out this environment variable
+## then add the code below
+from multiprocessing import set_start_method
+try:
+    set_start_method('spawn')
+except RuntimeError:
+    pass
+```
\ No newline at end of file
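The workaround above can be sketched as a standalone script (a minimal illustration of the start-method pattern, not the actual `vllm_serve.py` patch): forcing the `spawn` start method makes each worker process start fresh and initialize CUDA itself, instead of inheriting a forked copy of the parent's CUDA context.

```python
from multiprocessing import get_context, set_start_method

# Force 'spawn'; guard with try/except because a framework (e.g. vLLM/trl)
# may have set the start method already, in which case setting it again raises.
try:
    set_start_method("spawn")
except RuntimeError:
    pass  # keep whatever start method is already in effect

def square(x):
    # Placeholder for GPU work; in vllm_serve.py this is where a worker
    # would first touch CUDA and crash under the default 'fork' method.
    return x * x

if __name__ == "__main__":
    # Spawned workers re-import this module, so pool creation must stay
    # inside the __main__ guard.
    with get_context("spawn").Pool(2) as pool:
        print(pool.map(square, [1, 2, 3]))  # prints [1, 4, 9]
```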
@@ -27,11 +27,11 @@ export VLLM_USE_FLASH_MLA=1
 ## nnodes is the number of machines; set it to the actual count, e.g. 2 means 2 machines
 torchrun --nproc_per_node=8 \
 --nnodes=xxxx\
---node-rank=\${RANK} \
---master_addr=\${MASTER_ADDR} \
---master_port=\${MASTER_PORT} \
+--node-rank=${RANK} \
+--master_addr=${MASTER_ADDR} \
+--master_port=${MASTER_PORT} \
 src/train.py \
---deepspeed ~/GRPO/deepspeed/ds_z3_config.json \
+--deepspeed examples/deepspeed/ds_z3_config.json \
 --stage grpo \
 --do_train \
 --finetuning_type freeze \
@@ -57,7 +57,7 @@ torchrun --nproc_per_node=8 \
 --learning_rate 5e-3 \
 --lr_scheduler_type cosine \
 --logging_steps 1 \
---cutoff_len 8192 \
+--cutoff_len 2048 \
 --save_steps 100 \
 --plot_loss True \
 --num_train_epochs 1 \
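For reference, the rank bookkeeping behind these flags can be sketched as follows (hypothetical helper names; the constant 8 mirrors `--nproc_per_node=8` above, and the node index corresponds to `${RANK}` passed as `--node-rank`):

```python
def global_rank(node_rank: int, local_rank: int, nproc_per_node: int = 8) -> int:
    # Each machine runs nproc_per_node processes; the global rank of a
    # process is its node's index times that count, plus its local index.
    return node_rank * nproc_per_node + local_rank

def world_size(nnodes: int, nproc_per_node: int = 8) -> int:
    # Total number of training processes across all machines.
    return nnodes * nproc_per_node

# e.g. with --nnodes=2: node 0 holds global ranks 0-7, node 1 holds 8-15
```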