Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6c5b7af1
Unverified
Commit
6c5b7af1
authored
Jun 20, 2024
by
youkaichao
Committed by
GitHub
Jun 20, 2024
Browse files
[distributed][misc] use fork by default for mp (#5669)
parent
8065a7e2
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
38 additions
and
3 deletions
+38
-3
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+9
-0
vllm/distributed/device_communicators/custom_all_reduce_utils.py
...stributed/device_communicators/custom_all_reduce_utils.py
+27
-1
vllm/envs.py
vllm/envs.py
+2
-2
No files found.
.buildkite/test-pipeline.yaml
View file @
6c5b7af1
...
@@ -37,6 +37,9 @@ steps:
...
@@ -37,6 +37,9 @@ steps:
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
num_gpus
:
2
commands
:
commands
:
# FIXIT: find out which code initializes CUDA before the test runs.
# Until that is fixed, we need to use spawn to run this test.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
...
@@ -55,6 +58,9 @@ steps:
...
@@ -55,6 +58,9 @@ steps:
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
num_gpus
:
4
commands
:
commands
:
# FIXIT: find out which code initializes CUDA before the test runs.
# Until that is fixed, we need to use spawn to run this test.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
...
@@ -145,6 +151,9 @@ steps:
...
@@ -145,6 +151,9 @@ steps:
num_gpus
:
4
num_gpus
:
4
# This test runs llama 13B, so it is required to run on 4 GPUs.
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands
:
commands
:
# FIXIT: find out which code initializes CUDA before the test runs.
# Until that is fixed, we need to use spawn to run this test.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s -x lora/test_long_context.py
-
pytest -v -s -x lora/test_long_context.py
-
label
:
Tensorizer Test
-
label
:
Tensorizer Test
...
...
vllm/distributed/device_communicators/custom_all_reduce_utils.py
View file @
6c5b7af1
import
ctypes
import
ctypes
import
json
import
json
import
os
import
os
import
pickle
import
subprocess
import
sys
from
itertools
import
product
from
itertools
import
product
from
typing
import
Dict
,
List
,
Optional
,
Sequence
from
typing
import
Dict
,
List
,
Optional
,
Sequence
...
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
...
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
ids
=
list
(
range
(
num_dev
))
ids
=
list
(
range
(
num_dev
))
# batch of all pairs of GPUs
# batch of all pairs of GPUs
batch_src
,
batch_tgt
=
zip
(
*
list
(
product
(
ids
,
ids
)))
batch_src
,
batch_tgt
=
zip
(
*
list
(
product
(
ids
,
ids
)))
result
=
can_actually_p2p
(
batch_src
,
batch_tgt
)
# NOTE: we use `subprocess` rather than `multiprocessing` here
# because the caller might not have `if __name__ == "__main__":`,
# in which case we cannot use the spawn method in multiprocessing.
# However, `can_actually_p2p` requires the spawn method.
# The fix is to use `subprocess` to run this file as a script,
# since this file has its own `if __name__ == "__main__":` guard.
input_bytes
=
pickle
.
dumps
((
batch_src
,
batch_tgt
))
returned
=
subprocess
.
run
([
sys
.
executable
,
__file__
],
input
=
input_bytes
,
capture_output
=
True
)
# check if the subprocess is successful
try
:
returned
.
check_returncode
()
except
Exception
as
e
:
# wrap raised exception to provide more information
raise
RuntimeError
(
f
"Error happened when batch testing "
f
"peer-to-peer access from
{
batch_src
}
to
{
batch_tgt
}
"
)
from
e
result
=
pickle
.
loads
(
returned
.
stdout
)
for
_i
,
_j
,
r
in
zip
(
batch_src
,
batch_tgt
,
result
):
for
_i
,
_j
,
r
in
zip
(
batch_src
,
batch_tgt
,
result
):
cache
[
f
"
{
_i
}
->
{
_j
}
"
]
=
r
cache
[
f
"
{
_i
}
->
{
_j
}
"
]
=
r
with
open
(
path
,
"w"
)
as
f
:
with
open
(
path
,
"w"
)
as
f
:
...
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
...
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
__all__
=
[
"gpu_p2p_access_check"
]
__all__
=
[
"gpu_p2p_access_check"
]
if
__name__
==
"__main__"
:
batch_src
,
batch_tgt
=
pickle
.
loads
(
sys
.
stdin
.
buffer
.
read
())
result
=
can_actually_p2p
(
batch_src
,
batch_tgt
)
sys
.
stdout
.
buffer
.
write
(
pickle
.
dumps
(
result
))
vllm/envs.py
View file @
6c5b7af1
...
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
...
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_XLA_CACHE_PATH
:
str
=
"~/.vllm/xla_cache/"
VLLM_XLA_CACHE_PATH
:
str
=
"~/.vllm/xla_cache/"
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
spawn
"
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
fork
"
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_TARGET_DEVICE
:
str
=
"cuda"
VLLM_TARGET_DEVICE
:
str
=
"cuda"
MAX_JOBS
:
Optional
[
str
]
=
None
MAX_JOBS
:
Optional
[
str
]
=
None
...
@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Use dedicated multiprocess context for workers.
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD"
:
"VLLM_WORKER_MULTIPROC_METHOD"
:
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"
spawn
"
),
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"
fork
"
),
# Timeout for fetching images when serving multimodal models
# Timeout for fetching images when serving multimodal models
# Default is 5 seconds
# Default is 5 seconds
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment