"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "3d7eaf83d721ed1137ad1838b73be83c737721d4"
Commit 2eaae45d authored by chenych

support DCU

parent 3d98a379
@@ -129,19 +129,11 @@ class Worker(WorkerHelper):
         self._rank = rank
         self._world_size = world_size
-        os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("ROCR_VISIBLE_DEVICES")
-        os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK")
-        cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
-        torch.cuda.set_device(int(cuda_visible_devices))
-        ## for DCU K100_AI, get device_name via torch.cuda.get_device_name()
-        if "K500SM_AI" in torch.cuda.get_device_name():
-            print("Init DCU Devices")
-            os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("HIP_VISIBLE_DEVICES")
-            os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK")
-            cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
-            torch.cuda.set_device(int(cuda_visible_devices))
+        if "AMD" in torch.cuda.get_device_name():  # DCU support
+            os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("HIP_VISIBLE_DEVICES")
+            os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK")
+            cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
+            torch.cuda.set_device(int(cuda_visible_devices))
         master_addr = os.getenv("MASTER_ADDR")
         master_port = os.getenv("MASTER_PORT")
...
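For readers unfamiliar with the DCU path, the device setup introduced above can be read as the following standalone sketch. It is illustrative only: the helper name setup_dcu_visible_devices and the None checks are assumptions rather than code from this repository, and it presumes a ROCm build of PyTorch in which a Hygon DCU reports an AMD device name.

import os

import torch


def setup_dcu_visible_devices() -> None:
    # On DCU (ROCm), Ray exposes the devices assigned to this worker via
    # HIP_VISIBLE_DEVICES; mirror them into the CUDA-style variables that
    # the rest of the training stack reads.
    if "AMD" in torch.cuda.get_device_name():
        hip_devices = os.getenv("HIP_VISIBLE_DEVICES")
        if hip_devices is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = hip_devices
        ray_local_rank = os.getenv("RAY_LOCAL_RANK")
        if ray_local_rank is not None:
            os.environ["LOCAL_RANK"] = ray_local_rank
    # Bind this process to the device matching its local rank.
    torch.cuda.set_device(int(os.getenv("LOCAL_RANK", "0")))

Matching on "AMD" rather than a specific model string presumably lets the same branch cover K100_AI and other DCU models that report an AMD device name.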
@@ -75,7 +75,6 @@ class FSDPWorker(Worker):
         if not dist.is_initialized():
             self.print_rank0("Initializing distributed process group...")
             dist.init_process_group(backend="nccl")
-            print(f"!!! Rank {dist.get_rank()} initialized successfully!")
         # improve numerical stability
         torch.backends.cuda.matmul.allow_tf32 = False
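The hunk above only drops a debug print; the initialization itself is untouched. As a hedged aside, ROCm builds of PyTorch keep the "nccl" backend name (backed by RCCL), so a minimal, self-contained version of this init looks the same on DCU as on NVIDIA GPUs. The init_distributed name below is made up for illustration, and the rendezvous variables are assumed to be set by the launcher.

import torch
import torch.distributed as dist


def init_distributed() -> None:
    if not dist.is_initialized():
        # Uses the default env:// rendezvous: MASTER_ADDR, MASTER_PORT,
        # RANK and WORLD_SIZE must be provided by the launcher (Ray, torchrun, ...).
        dist.init_process_group(backend="nccl")
    # Disable TF32 matmuls for numerical stability, as in the hunk above.
    torch.backends.cuda.matmul.allow_tf32 = False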
@@ -267,11 +266,6 @@ class FSDPWorker(Worker):
         sync_module_states = False
         param_init_fn = None
-        # rank = torch.cuda.set_device(self.rank)
-        # model = model.to(rank)
-        local_rank = int(os.environ["LOCAL_RANK"])
-        print(f"!!! rank={self.rank}, local_rank={local_rank}, torch.cuda.current_device()={torch.cuda.current_device()}")
-        print(f"self.device_mesh = {self.device_mesh}")
         self.fsdp_module = FSDP(
             model,
             sharding_strategy=sharding_strategy,
...
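The last hunk strips debugging output around the FSDP wrapping without changing the wrapping itself. A minimal sketch of that call follows for orientation; the toy nn.Linear model and the FULL_SHARD strategy are placeholders chosen for illustration, not the configuration used in this repository.

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardingStrategy

# Assumes the process group has been initialized and the device bound,
# as in the sketches above.
model = nn.Linear(1024, 1024)

fsdp_module = FSDP(
    model,
    sharding_strategy=ShardingStrategy.FULL_SHARD,  # placeholder strategy
    device_id=torch.cuda.current_device(),          # shard onto the bound DCU/GPU
    sync_module_states=False,                       # mirrors the values in the hunk
    param_init_fn=None,
)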