Unverified Commit 5304b4ef authored by Liangsheng Yin's avatar Liangsheng Yin Committed by GitHub
Browse files

Add `--enable-p2p-check` option (#599)

parent 26908d95
...@@ -362,7 +362,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port ...@@ -362,7 +362,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
``` ```
### Additional Arguments ### Additional Arguments
- Add `--tp 2` to enable tensor parallelism. - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
``` ```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
``` ```
......
...@@ -259,7 +259,10 @@ class ModelRunner: ...@@ -259,7 +259,10 @@ class ModelRunner:
logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.") logger.info(f"[gpu_id={self.gpu_id}] Set cuda device.")
torch.cuda.set_device(self.gpu_id) torch.cuda.set_device(self.gpu_id)
logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.") logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
if not server_args.enable_p2p_check:
monkey_patch_vllm_p2p_access_check(self.gpu_id) monkey_patch_vllm_p2p_access_check(self.gpu_id)
if server_args.nccl_init_addr: if server_args.nccl_init_addr:
nccl_init_method = f"tcp://{server_args.nccl_init_addr}" nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
else: else:
......
...@@ -55,6 +55,7 @@ class ServerArgs: ...@@ -55,6 +55,7 @@ class ServerArgs:
disable_regex_jump_forward: bool = False disable_regex_jump_forward: bool = False
disable_disk_cache: bool = False disable_disk_cache: bool = False
attention_reduce_in_fp32: bool = False attention_reduce_in_fp32: bool = False
enable_p2p_check: bool = False
# Distributed args # Distributed args
nccl_init_addr: Optional[str] = None nccl_init_addr: Optional[str] = None
...@@ -304,6 +305,11 @@ class ServerArgs: ...@@ -304,6 +305,11 @@ class ServerArgs:
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16." help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
"This only affects Triton attention kernels", "This only affects Triton attention kernels",
) )
parser.add_argument(
"--enable-p2p-check",
action="store_true",
help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
)
@classmethod @classmethod
def from_cli_args(cls, args: argparse.Namespace): def from_cli_args(cls, args: argparse.Namespace):
......
...@@ -458,12 +458,7 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int): ...@@ -458,12 +458,7 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
NOTE: We assume the p2p access is always allowed, which can be wrong for some setups. NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
""" """
# TODO: need a better check than just dev str name match
# compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
device_name = torch.cuda.get_device_name(gpu_id)
if "RTX 40" not in device_name:
import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt
setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True) setattr(tgt, "gpu_p2p_access_check", lambda *arg, **kwargs: True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment