Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f6662071
Unverified
Commit
f6662071
authored
Jul 02, 2024
by
youkaichao
Committed by
GitHub
Jul 02, 2024
Browse files
[misc][distributed] error on invalid state (#6092)
parent
d830656a
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
29 additions
and
1 deletion
+29
-1
vllm/executor/multiproc_gpu_executor.py
vllm/executor/multiproc_gpu_executor.py
+3
-0
vllm/executor/ray_gpu_executor.py
vllm/executor/ray_gpu_executor.py
+4
-1
vllm/utils.py
vllm/utils.py
+22
-0
No files found.
vllm/executor/multiproc_gpu_executor.py
View file @
f6662071
...
@@ -10,6 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
...
@@ -10,6 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
cuda_device_count_stateless
,
from
vllm.utils
import
(
cuda_device_count_stateless
,
error_on_invalid_device_count_status
,
get_distributed_init_method
,
get_open_port
,
get_distributed_init_method
,
get_open_port
,
get_vllm_instance_id
,
make_async
,
get_vllm_instance_id
,
make_async
,
update_environment_variables
)
update_environment_variables
)
...
@@ -39,6 +40,8 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
...
@@ -39,6 +40,8 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
assert
world_size
<=
cuda_device_count_stateless
(),
(
assert
world_size
<=
cuda_device_count_stateless
(),
(
"please set tensor_parallel_size to less than max local gpu count"
)
"please set tensor_parallel_size to less than max local gpu count"
)
error_on_invalid_device_count_status
()
# Multiprocessing-based executor does not support multi-node setting.
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# Since it only works for single node, we can use the loopback address
# 127.0.0.1 for communication.
# 127.0.0.1 for communication.
...
...
vllm/executor/ray_gpu_executor.py
View file @
f6662071
...
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
...
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.executor.ray_utils
import
RayWorkerWrapper
,
ray
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.sequence
import
ExecuteModelRequest
,
SamplerOutput
from
vllm.utils
import
(
get_distributed_init_method
,
get_ip
,
get_open_port
,
from
vllm.utils
import
(
error_on_invalid_device_count_status
,
get_distributed_init_method
,
get_ip
,
get_open_port
,
get_vllm_instance_id
,
make_async
)
get_vllm_instance_id
,
make_async
)
if
ray
is
not
None
:
if
ray
is
not
None
:
...
@@ -175,6 +176,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
...
@@ -175,6 +176,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
distributed_init_method
=
get_distributed_init_method
(
distributed_init_method
=
get_distributed_init_method
(
driver_ip
,
get_open_port
())
driver_ip
,
get_open_port
())
error_on_invalid_device_count_status
()
# Initialize the actual workers inside worker wrapper.
# Initialize the actual workers inside worker wrapper.
init_worker_all_kwargs
=
[
init_worker_all_kwargs
=
[
self
.
_get_worker_kwargs
(
self
.
_get_worker_kwargs
(
...
...
vllm/utils.py
View file @
f6662071
import
argparse
import
argparse
import
asyncio
import
asyncio
import
contextlib
import
datetime
import
datetime
import
enum
import
enum
import
gc
import
gc
...
@@ -816,6 +817,27 @@ def cuda_device_count_stateless() -> int:
...
@@ -816,6 +817,27 @@ def cuda_device_count_stateless() -> int:
return
_cuda_device_count_stateless
(
envs
.
CUDA_VISIBLE_DEVICES
)
return
_cuda_device_count_stateless
(
envs
.
CUDA_VISIBLE_DEVICES
)
def error_on_invalid_device_count_status():
    """Raise if torch's remembered CUDA device count exceeds the current one.

    The check only runs when ``torch.cuda.device_count`` reports at least one
    cached entry, i.e. it has already been called and its result is cached.
    In that case the cached value is compared with
    ``cuda_device_count_stateless()``.

    Raises:
        RuntimeError: if the cached count is greater than the stateless count.
    """
    cached_calls = 0
    # Best-effort probe: future pytorch will fix the issue and stop caching
    # device_count; at that point `.cache_info().currsize` will error out,
    # so any failure here is ignored and the counter stays at 0.
    with contextlib.suppress(Exception):
        cached_calls = torch.cuda.device_count.cache_info().currsize

    if cached_calls == 0:
        # device_count() has not been called (or is not cached): nothing
        # stale can have been remembered.
        return

    # The function was already called, so this returns the cached result.
    torch_count = torch.cuda.device_count()
    live_count = cuda_device_count_stateless()
    if torch_count <= live_count:
        return

    raise RuntimeError(
        "The number of CUDA devices has changed since the first "
        "call to torch.cuda.device_count(). This is not allowed "
        "and may result in undefined behavior. Please check out "
        "https://github.com/vllm-project/vllm/issues/6056 to "
        "find the first call to torch.cuda.device_count() "
        "and defer it until the engine is up. Or you can set "
        "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
# NVML utils
# NVML utils
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
# all the related functions work on real physical device ids.
# all the related functions work on real physical device ids.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment