Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6c5b7af1
"vllm/vscode:/vscode.git/clone" did not exist on "8cdc3a30b21b06a793c896ced997436b8f1cb255"
Unverified
Commit
6c5b7af1
authored
Jun 20, 2024
by
youkaichao
Committed by
GitHub
Jun 20, 2024
Browse files
[distributed][misc] use fork by default for mp (#5669)
parent
8065a7e2
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
38 additions
and
3 deletions
+38
-3
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+9
-0
vllm/distributed/device_communicators/custom_all_reduce_utils.py
...stributed/device_communicators/custom_all_reduce_utils.py
+27
-1
vllm/envs.py
vllm/envs.py
+2
-2
No files found.
.buildkite/test-pipeline.yaml
View file @
6c5b7af1
...
...
@@ -37,6 +37,9 @@ steps:
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
commands
:
# FIXIT: find out which code initializes CUDA before the test runs.
# Until that is fixed, we need to use spawn to run this test.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
...
...
@@ -55,6 +58,9 @@ steps:
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
commands
:
# FIXIT: find out which code initializes CUDA before the test runs.
# Until that is fixed, we need to use spawn to run this test.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
...
...
@@ -145,6 +151,9 @@ steps:
num_gpus
:
4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands
:
# FIXIT: find out which code initializes CUDA before the test runs.
# Until that is fixed, we need to use spawn to run this test.
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s -x lora/test_long_context.py
-
label
:
Tensorizer Test
...
...
vllm/distributed/device_communicators/custom_all_reduce_utils.py
View file @
6c5b7af1
import
ctypes
import
json
import
os
import
pickle
import
subprocess
import
sys
from
itertools
import
product
from
typing
import
Dict
,
List
,
Optional
,
Sequence
...
...
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
ids
=
list
(
range
(
num_dev
))
# batch of all pairs of GPUs
batch_src
,
batch_tgt
=
zip
(
*
list
(
product
(
ids
,
ids
)))
result
=
can_actually_p2p
(
batch_src
,
batch_tgt
)
# NOTE: we use `subprocess` rather than `multiprocessing` here
# because the caller might not have `if __name__ == "__main__":`,
# in which case we cannot use the spawn method of multiprocessing.
# However, `can_actually_p2p` requires the spawn method.
# The fix is to call the function through `subprocess`, running this
# file as a script, since it does have `if __name__ == "__main__":`.
input_bytes
=
pickle
.
dumps
((
batch_src
,
batch_tgt
))
returned
=
subprocess
.
run
([
sys
.
executable
,
__file__
],
input
=
input_bytes
,
capture_output
=
True
)
# check if the subprocess is successful
try
:
returned
.
check_returncode
()
except
Exception
as
e
:
# wrap raised exception to provide more information
raise
RuntimeError
(
f
"Error happened when batch testing "
f
"peer-to-peer access from
{
batch_src
}
to
{
batch_tgt
}
"
)
from
e
result
=
pickle
.
loads
(
returned
.
stdout
)
for
_i
,
_j
,
r
in
zip
(
batch_src
,
batch_tgt
,
result
):
cache
[
f
"
{
_i
}
->
{
_j
}
"
]
=
r
with
open
(
path
,
"w"
)
as
f
:
...
...
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
__all__
=
[
"gpu_p2p_access_check"
]
if __name__ == "__main__":
    # Script entry point used when this file is run as a subprocess:
    # read a pickled (batch_src, batch_tgt) pair from stdin, run
    # `can_actually_p2p` on it, and write the pickled result to stdout.
    payload = sys.stdin.buffer.read()
    src_ids, tgt_ids = pickle.loads(payload)
    encoded = pickle.dumps(can_actually_p2p(src_ids, tgt_ids))
    sys.stdout.buffer.write(encoded)
vllm/envs.py
View file @
6c5b7af1
...
...
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
VLLM_CPU_KVCACHE_SPACE
:
int
=
0
VLLM_XLA_CACHE_PATH
:
str
=
"~/.vllm/xla_cache/"
VLLM_USE_RAY_COMPILED_DAG
:
bool
=
False
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
spawn
"
VLLM_WORKER_MULTIPROC_METHOD
:
str
=
"
fork
"
VLLM_IMAGE_FETCH_TIMEOUT
:
int
=
5
VLLM_TARGET_DEVICE
:
str
=
"cuda"
MAX_JOBS
:
Optional
[
str
]
=
None
...
...
@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD"
:
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"
spawn
"
),
lambda
:
os
.
getenv
(
"VLLM_WORKER_MULTIPROC_METHOD"
,
"
fork
"
),
# Timeout for fetching images when serving multimodal models
# Default is 5 seconds
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment