OpenDAS / EasyR1 · Commits

Commit 827b4040
Authored Jul 29, 2025 by chenych
Fix ray.init bug and update README
Parent: 496acb03

Showing 3 changed files, with 7 additions and 3 deletions:

- README.md (+4, -0)
- verl/trainer/main.py (+2, -2)
- verl/workers/rollout/vllm_rollout_spmd.py (+1, -1)
README.md (+4, -0)

@@ -169,3 +169,7 @@ python3 scripts/model_merger.py --local_dir path_to_your_actor_checkpoint
 > RuntimeError: No HIP GPUs are available
 
 Add `export HIP_VISIBLE_DEVICES=0,1,2,3` to `~/.bashrc`, then re-source the environment.
+
+> ImportError: cannot import name 'index_first_axis' from 'transformers.modeling_flash_attention_utils'
+
+Downgrade the transformers version; a reference version for Qwen2.5 is 4.51.3.
\ No newline at end of file
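As a quick way to confirm both FAQ items on a DCU machine, the sketch below (illustrative only, not part of the commit; it uses only `torch` and `transformers` calls already referenced in this repository) prints whether the HIP build of PyTorch can see any devices and whether the `index_first_axis` import resolves:

```python
# Sanity check for the two FAQ entries above; illustrative only, not part of the commit.
import torch
import transformers

# "No HIP GPUs are available": on ROCm/HIP builds the devices are exposed through the CUDA API,
# so a count of 0 usually means HIP_VISIBLE_DEVICES is missing from ~/.bashrc.
print("HIP runtime:", torch.version.hip)            # None on CUDA-only builds of PyTorch
print("visible devices:", torch.cuda.device_count())

# "cannot import name 'index_first_axis'": fails on transformers versions that no longer
# provide this helper, which is why the README suggests downgrading (4.51.3 for Qwen2.5).
print("transformers version:", transformers.__version__)
try:
    from transformers.modeling_flash_attention_utils import index_first_axis  # noqa: F401
    print("index_first_axis import: OK")
except ImportError as exc:
    print("index_first_axis import failed:", exc)
```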
verl/trainer/main.py (+2, -2)

@@ -111,13 +111,13 @@ def main():
                 "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:False",
             }
         }
-        ray.init(runtime_env=runtime_env)  # this is for local ray cluster
         if torch.version.hip is not None:
             ray.init(num_gpus=torch.cuda.device_count(),  ## for dcu devices
                      ignore_reinit_error=True,
                      runtime_env=runtime_env)
         else:
-            ray.init(runtime_env=runtime_env)
+            ray.init(runtime_env=runtime_env)  # this is for local ray cluster
 
     runner = Runner.remote()
     ray.get(runner.run.remote(ppo_config))
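Stripped of diff markers and of the surrounding `main()` plumbing (config parsing, the `Runner` actor), the initialization path after this commit boils down to the following condensed sketch; `runtime_env` is shortened to the one entry visible in the hunk:

```python
# Condensed sketch of the post-fix initialization in verl/trainer/main.py.
import ray
import torch

runtime_env = {"env_vars": {"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:False"}}

if torch.version.hip is not None:
    # ROCm/HIP (DCU) build: Ray may not detect the accelerators on its own,
    # so pass the device count reported by torch explicitly.
    ray.init(num_gpus=torch.cuda.device_count(),  ## for dcu devices
             ignore_reinit_error=True,
             runtime_env=runtime_env)
else:
    ray.init(runtime_env=runtime_env)  # this is for local ray cluster
```

The `## for dcu devices` branch is taken on ROCm/HIP builds, where `torch.version.hip` is a version string rather than `None`; on CUDA builds it is `None` and the plain local-cluster call is used, so `ray.init` now runs exactly once either way.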
verl/workers/rollout/vllm_rollout_spmd.py (+1, -1)

@@ -89,7 +89,7 @@ class vLLMRollout(BaseRollout):
         )
 
         # Offload vllm model to reduce peak memory usage
-        self.inference_engine.sleep(level=1)
+        # self.inference_engine.sleep(level=1)
 
         sampling_kwargs = {
             "max_tokens": config.response_length,
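The change above disables the sleep-mode offload unconditionally. If the offload is only problematic on DCU/HIP builds, one hypothetical alternative (not what this commit does; `maybe_offload_engine` is an invented helper name) would be to reuse the same `torch.version.hip` check as in `main.py`:

```python
# Hypothetical alternative to commenting the call out: keep the offload on CUDA devices
# and skip it only on ROCm/HIP (DCU) builds. Not part of this commit.
import torch


def maybe_offload_engine(inference_engine) -> None:
    """Offload the vLLM model to reduce peak memory usage, where sleep mode is usable."""
    if torch.version.hip is None:
        inference_engine.sleep(level=1)
```

Called in `vLLMRollout.__init__` in place of the commented-out line, this would keep the memory saving on CUDA machines while matching the commit's behaviour on DCU.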