OpenDAS / EasyR1 · Commits · 3d98a379
"docs/vscode:/vscode.git/clone" did not exist on "fe2b6ca6e8cdf652e36d48f5a88c58f13c53ad8c"
Commit 3d98a379 authored May 07, 2025 by chenych

Fix devices recognize

parent 20247eb8
Showing 2 changed files with 15 additions and 1 deletion:

    verl/single_controller/base/worker.py   +8  -0
    verl/workers/fsdp_workers.py             +7  -1
verl/single_controller/base/worker.py

@@ -135,6 +135,14 @@ class Worker(WorkerHelper):
         cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
         torch.cuda.set_device(int(cuda_visible_devices))
+
+        ## for DCU K100_AI, get the device_name via torch.cuda.get_device_name()
+        if "K500SM_AI" in torch.cuda.get_device_name():
+            print("Init DCU Devices")
+            os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("HIP_VISIBLE_DEVICES")
+            os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK")
+            cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
+            torch.cuda.set_device(int(cuda_visible_devices))
 
         master_addr = os.getenv("MASTER_ADDR")
         master_port = os.getenv("MASTER_PORT")
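For context on this hunk: on a DCU accelerator, a ROCm/HIP build of PyTorch still exposes devices through the torch.cuda API, but the launcher advertises device visibility through HIP_VISIBLE_DEVICES and Ray's RAY_LOCAL_RANK rather than the CUDA-style variables the rest of the stack reads. The patch detects a DCU by its device name and mirrors the HIP-style variables into the CUDA-style ones before pinning the device. A minimal standalone sketch of the same idea (select_device is a hypothetical helper, and the environment-variable behavior is inferred from this diff, not from the repository's docs):

    import os

    import torch


    def select_device() -> int:
        """Pin this worker process to one accelerator and return its index."""
        if "K500SM_AI" in torch.cuda.get_device_name():
            # Mirror the HIP-style variables into the CUDA-style ones that
            # the rest of the training stack reads.
            os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["HIP_VISIBLE_DEVICES"]
            os.environ["LOCAL_RANK"] = os.environ["RAY_LOCAL_RANK"]
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        torch.cuda.set_device(local_rank)
        return local_rank

One caveat on the committed version: os.getenv("HIP_VISIBLE_DEVICES") returns None when the variable is unset, and assigning None into os.environ raises a TypeError; the sketch indexes os.environ directly so a missing variable fails fast with a KeyError instead.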
verl/workers/fsdp_workers.py

@@ -15,6 +15,8 @@
 The main entry point to run the PPO algorithm
 """
+import os
+
 from typing import Literal, Optional, Union
 
 import numpy as np
@@ -71,7 +73,9 @@ class FSDPWorker(Worker):
         self.role = role
 
         if not dist.is_initialized():
+            self.print_rank0("Initializing distributed process group...")
             dist.init_process_group(backend="nccl")
+            print(f"!!! Rank {dist.get_rank()} initialized successfully!")
 
         # improve numerical stability
         torch.backends.cuda.matmul.allow_tf32 = False
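This hunk adds a rank-0 log line before process-group initialization and a per-rank confirmation after it. The diff does not show how print_rank0 is implemented; a minimal sketch, assuming it simply gates on the global rank and tolerates being called before initialization:

    import torch.distributed as dist


    def print_rank0(message: str) -> None:
        """Print once per job instead of once per process."""
        # Before init_process_group there is no rank yet; print unconditionally.
        if not dist.is_initialized() or dist.get_rank() == 0:
            print(message)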
@@ -265,7 +269,9 @@ class FSDPWorker(Worker):
         # rank = torch.cuda.set_device(self.rank)
         # model = model.to(rank)
-        print(f"!!! local_rank={self.rank}, torch.cuda.current_device()={torch.cuda.current_device()}")
+        local_rank = int(os.environ["LOCAL_RANK"])
+        print(f"!!! rank={self.rank}, local_rank={local_rank}, torch.cuda.current_device()={torch.cuda.current_device()}")
+        print(f"self.device_mesh = {self.device_mesh}")
         self.fsdp_module = FSDP(
             model,
             sharding_strategy=sharding_strategy,
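The removed print mislabeled self.rank (the global rank) as local_rank; the replacement reads LOCAL_RANK from the environment and reports both, together with the current CUDA device and the device mesh, just before the model is wrapped in FSDP. These prints amount to a manual consistency check; restated as an assertion (a hypothetical helper, not part of the repository):

    import os

    import torch


    def assert_device_matches_local_rank() -> None:
        """Each process should be pinned to the GPU matching its LOCAL_RANK."""
        local_rank = int(os.environ["LOCAL_RANK"])
        current = torch.cuda.current_device()
        assert current == local_rank, (
            f"device mismatch: LOCAL_RANK={local_rank} but "
            f"torch.cuda.current_device()={current}; call torch.cuda.set_device first"
        )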