Commit fa3ffb43 authored by Julien Denize, committed by Kevin H. Luu
Browse files

[BugFix] Ray with multiple nodes (#28873)


Signed-off-by: Julien Denize <julien.denize@mistral.ai>
(cherry picked from commit cdeec2e6)
parent 6d597436
...@@ -204,14 +204,14 @@ class Worker(WorkerBase): ...@@ -204,14 +204,14 @@ class Worker(WorkerBase):
assert self.local_rank < torch.cuda.device_count(), ( assert self.local_rank < torch.cuda.device_count(), (
f"DP adjusted local rank {self.local_rank} is out of bounds. " f"DP adjusted local rank {self.local_rank} is out of bounds. "
) )
visible_device_count = ( visible_device_count = (
torch.cuda.device_count() if torch.cuda.is_available() else 0 torch.cuda.device_count() if torch.cuda.is_available() else 0
) )
assert self.parallel_config.local_world_size <= visible_device_count, ( assert self.parallel_config.local_world_size <= visible_device_count, (
f"local_world_size ({self.parallel_config.local_world_size}) must be " f"local_world_size ({self.parallel_config.local_world_size}) must "
f"less than or equal to the number of visible devices " f"be less than or equal to the number of visible devices "
f"({visible_device_count})." f"({visible_device_count})."
) )
self.device = torch.device(f"cuda:{self.local_rank}") self.device = torch.device(f"cuda:{self.local_rank}")
current_platform.set_device(self.device) current_platform.set_device(self.device)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment