"tests/vscode:/vscode.git/clone" did not exist on "8da604d7ab0f55d0263d7372d9e1c32b0467f324"
Commit 3d98a379 authored by chenych

Fix device recognition

parent 20247eb8
@@ -135,6 +135,14 @@ class Worker(WorkerHelper):
        cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
        torch.cuda.set_device(int(cuda_visible_devices))
        ## for DCU K100_AI, get device_name via torch.cuda.get_device_name()
        if "K500SM_AI" in torch.cuda.get_device_name():
            print("Init DCU Devices")
            # remap the HIP/Ray env vars to their CUDA-style names; guard
            # against unset vars so os.environ never receives None
            hip_visible_devices = os.getenv("HIP_VISIBLE_DEVICES")
            if hip_visible_devices is not None:
                os.environ["CUDA_VISIBLE_DEVICES"] = hip_visible_devices
            os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK", "0")
            cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
            torch.cuda.set_device(int(cuda_visible_devices))
        master_addr = os.getenv("MASTER_ADDR")
        master_port = os.getenv("MASTER_PORT")
......
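The hunk above bridges DCU (ROCm/HIP) device variables onto the CUDA-style names that PyTorch code expects. Below is a minimal standalone sketch of that remapping; the `select_dcu_device` helper is hypothetical and not part of this commit, and it assumes, as in the hunk, that the per-node rank arrives via `RAY_LOCAL_RANK`:

```python
import os

import torch


def select_dcu_device() -> int:
    """Bind this process to its local accelerator, remapping DCU env vars.

    On ROCm/DCU builds the runtime reads HIP_VISIBLE_DEVICES; copying it
    (and the Ray-provided rank) into the CUDA-style variables lets
    downstream CUDA-flavored code run unchanged.
    """
    if "K500SM_AI" in torch.cuda.get_device_name():
        hip_visible_devices = os.getenv("HIP_VISIBLE_DEVICES")
        if hip_visible_devices is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = hip_visible_devices
        os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK", "0")
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    return local_rank
```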
@@ -15,6 +15,8 @@
The main entry point to run the PPO algorithm
"""
import os
from typing import Literal, Optional, Union
import numpy as np
@@ -71,7 +73,9 @@ class FSDPWorker(Worker):
        self.role = role
        if not dist.is_initialized():
            self.print_rank0("Initializing distributed process group...")
            dist.init_process_group(backend="nccl")
            print(f"!!! Rank {dist.get_rank()} initialized successfully!")
        # improve numerical stability
        torch.backends.cuda.matmul.allow_tf32 = False
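The guarded `init_process_group` above runs only once per process. Here is a self-contained sketch of the same pattern, assuming the standard torchrun rendezvous variables (`MASTER_ADDR`, `MASTER_PORT`, `RANK`, `WORLD_SIZE`) are set; the timeout value is an illustrative assumption, not taken from the commit:

```python
import datetime

import torch.distributed as dist


def init_distributed_once() -> None:
    # Idempotent: a second call must not re-initialize the process group.
    if dist.is_initialized():
        return
    # On ROCm/DCU builds the "nccl" backend name resolves to RCCL, so the
    # same call works on both NVIDIA GPUs and DCU devices.
    dist.init_process_group(
        backend="nccl",
        timeout=datetime.timedelta(minutes=30),  # assumed value
    )
    print(f"Rank {dist.get_rank()}/{dist.get_world_size()} initialized")
```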
@@ -265,7 +269,9 @@ class FSDPWorker(Worker):
        # rank = torch.cuda.set_device(self.rank)
        # model = model.to(rank)
        print(f"!!! local_rank={self.rank}, torch.cuda.current_device()={torch.cuda.current_device()}")
        local_rank = int(os.environ["LOCAL_RANK"])
        print(f"!!! rank={self.rank}, local_rank={local_rank}, torch.cuda.current_device()={torch.cuda.current_device()}")
        print(f"self.device_mesh = {self.device_mesh}")
        self.fsdp_module = FSDP(
            model,
            sharding_strategy=sharding_strategy,
......
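The prints above verify that each rank is pinned to the device FSDP will shard onto. A hedged sketch of that placement with `device_id` passed explicitly; the `wrap_model` helper, the model argument, and the sharding strategy here are placeholders, not the commit's actual configuration:

```python
import os

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardingStrategy


def wrap_model(model: torch.nn.Module) -> FSDP:
    # Pin the process to its local GPU before wrapping, so FSDP's
    # device_id matches torch.cuda.current_device().
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    return FSDP(
        model,
        sharding_strategy=ShardingStrategy.FULL_SHARD,  # placeholder choice
        device_id=torch.cuda.current_device(),
    )
```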