Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6c4dbe23
Unverified
Commit 6c4dbe23 authored Feb 11, 2025 by ℍ𝕠𝕝𝕝𝕠𝕨 𝕄𝕒𝕟
Committed by GitHub on Feb 12, 2025
Browse files
[BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
parent
21f5d50f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing 4 changed files with 9 additions and 9 deletions
+9
-9
examples/offline_inference/rlhf.py
examples/offline_inference/rlhf.py
+1
-1
examples/offline_inference/rlhf_colocate.py
examples/offline_inference/rlhf_colocate.py
+1
-1
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+5
-5
tests/distributed/test_custom_all_reduce.py
tests/distributed/test_custom_all_reduce.py
+2
-2
No files found.
examples/offline_inference/rlhf.py
View file @
6c4dbe23
...
@@ -92,7 +92,7 @@ class MyLLM(LLM):
...
@@ -92,7 +92,7 @@ class MyLLM(LLM):
# a hack to make the script work.
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
# at the top-level
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
super
().
__init__
(
*
args
,
**
kwargs
)
super
().
__init__
(
*
args
,
**
kwargs
)
...
...
examples/offline_inference/rlhf_colocate.py
View file @
6c4dbe23
...
@@ -59,7 +59,7 @@ class MyLLM(LLM):
...
@@ -59,7 +59,7 @@ class MyLLM(LLM):
# a hack to make the script work.
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
# at the top-level
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
# every worker will use 0.4 GPU, so that we can schedule
# every worker will use 0.4 GPU, so that we can schedule
# 2 instances on the same GPUs.
# 2 instances on the same GPUs.
os
.
environ
[
"VLLM_RAY_PER_WORKER_GPUS"
]
=
"0.4"
os
.
environ
[
"VLLM_RAY_PER_WORKER_GPUS"
]
=
"0.4"
...
...
tests/distributed/test_comm_ops.py
View file @
6c4dbe23
...
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
# they will be able to set the device to the correct GPU
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_tensor_dict_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
def
send_recv_tensor_dict_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
...
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
send_recv_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
def
send_recv_test_worker
(
tp_size
:
int
,
pp_size
:
int
,
rank
:
int
,
distributed_init_port
:
str
):
distributed_init_port
:
str
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
tests/distributed/test_custom_all_reduce.py
View file @
6c4dbe23
...
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
...
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
graph_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
def
graph_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
...
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
def
eager_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
def
eager_allreduce
(
tp_size
,
pp_size
,
rank
,
distributed_init_port
):
del
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
os
.
environ
.
pop
(
"CUDA_VISIBLE_DEVICES"
,
None
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
device
=
torch
.
device
(
f
"cuda:
{
rank
}
"
)
torch
.
cuda
.
set_device
(
device
)
torch
.
cuda
.
set_device
(
device
)
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
init_test_distributed_environment
(
tp_size
,
pp_size
,
rank
,
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment