Unverified Commit 6c5b7af1 authored by youkaichao's avatar youkaichao Committed by GitHub
Browse files

[distributed][misc] use fork by default for mp (#5669)

parent 8065a7e2
...@@ -37,6 +37,9 @@ steps: ...@@ -37,6 +37,9 @@ steps:
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
commands: commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
...@@ -55,6 +58,9 @@ steps: ...@@ -55,6 +58,9 @@ steps:
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
commands: commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
...@@ -145,6 +151,9 @@ steps: ...@@ -145,6 +151,9 @@ steps:
num_gpus: 4 num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs. # This test runs llama 13B, so it is required to run on 4 GPUs.
commands: commands:
# FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py - pytest -v -s -x lora/test_long_context.py
- label: Tensorizer Test - label: Tensorizer Test
......
import ctypes import ctypes
import json import json
import os import os
import pickle
import subprocess
import sys
from itertools import product from itertools import product
from typing import Dict, List, Optional, Sequence from typing import Dict, List, Optional, Sequence
...@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ...@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
ids = list(range(num_dev)) ids = list(range(num_dev))
# batch of all pairs of GPUs # batch of all pairs of GPUs
batch_src, batch_tgt = zip(*list(product(ids, ids))) batch_src, batch_tgt = zip(*list(product(ids, ids)))
result = can_actually_p2p(batch_src, batch_tgt) # NOTE: we use `subprocess` rather than `multiprocessing` here
# because the caller might not have `if __name__ == "__main__":`,
# in that case we cannot use spawn method in multiprocessing.
# However, `can_actually_p2p` requires spawn method.
# The fix is, we use `subprocess` to call the function,
# where we have `if __name__ == "__main__":` in this file.
input_bytes = pickle.dumps((batch_src, batch_tgt))
returned = subprocess.run([sys.executable, __file__],
input=input_bytes,
capture_output=True)
# check if the subprocess is successful
try:
returned.check_returncode()
except Exception as e:
# wrap raised exception to provide more information
raise RuntimeError(
f"Error happened when batch testing "
f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
result = pickle.loads(returned.stdout)
for _i, _j, r in zip(batch_src, batch_tgt, result): for _i, _j, r in zip(batch_src, batch_tgt, result):
cache[f"{_i}->{_j}"] = r cache[f"{_i}->{_j}"] = r
with open(path, "w") as f: with open(path, "w") as f:
...@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: ...@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
__all__ = ["gpu_p2p_access_check"] __all__ = ["gpu_p2p_access_check"]
if __name__ == "__main__":
batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
result = can_actually_p2p(batch_src, batch_tgt)
sys.stdout.buffer.write(pickle.dumps(result))
...@@ -29,7 +29,7 @@ if TYPE_CHECKING: ...@@ -29,7 +29,7 @@ if TYPE_CHECKING:
VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/" VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
MAX_JOBS: Optional[str] = None MAX_JOBS: Optional[str] = None
...@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Use dedicated multiprocess context for workers. # Use dedicated multiprocess context for workers.
# Both spawn and fork work # Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD": "VLLM_WORKER_MULTIPROC_METHOD":
lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"), lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
# Timeout for fetching images when serving multimodal models # Timeout for fetching images when serving multimodal models
# Default is 5 seconds # Default is 5 seconds
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment