Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
5304b4ef
"docker/diffusers-doc-builder/Dockerfile" did not exist on "bcb476797ccb7523f3e114f7440b4c8d9bb7154b"
Unverified
Commit
5304b4ef
authored
Jul 06, 2024
by
Liangsheng Yin
Committed by
GitHub
Jul 06, 2024
Browse files
Add `--enable-p2p-check` option (#599)
parent
26908d95
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
13 additions
and
9 deletions
+13
-9
README.md
README.md
+1
-1
python/sglang/srt/managers/controller/model_runner.py
python/sglang/srt/managers/controller/model_runner.py
+4
-1
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+6
-0
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+2
-7
No files found.
README.md
View file @
5304b4ef
...
...
@@ -362,7 +362,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
```
### Additional Arguments
-
Add
`--tp 2`
to enable tensor parallelism.
-
Add
`--tp 2`
to enable tensor parallelism.
If the server reports the error
`peer access is not supported between these two devices`
, add the
`--enable-p2p-check`
option.
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
```
...
...
python/sglang/srt/managers/controller/model_runner.py
View file @
5304b4ef
...
...
@@ -259,7 +259,10 @@ class ModelRunner:
logger
.
info
(
f
"[gpu_id=
{
self
.
gpu_id
}
] Set cuda device."
)
torch
.
cuda
.
set_device
(
self
.
gpu_id
)
logger
.
info
(
f
"[gpu_id=
{
self
.
gpu_id
}
] Init nccl begin."
)
monkey_patch_vllm_p2p_access_check
(
self
.
gpu_id
)
if
not
server_args
.
enable_p2p_check
:
monkey_patch_vllm_p2p_access_check
(
self
.
gpu_id
)
if
server_args
.
nccl_init_addr
:
nccl_init_method
=
f
"tcp://
{
server_args
.
nccl_init_addr
}
"
else
:
...
...
python/sglang/srt/server_args.py
View file @
5304b4ef
...
...
@@ -55,6 +55,7 @@ class ServerArgs:
disable_regex_jump_forward
:
bool
=
False
disable_disk_cache
:
bool
=
False
attention_reduce_in_fp32
:
bool
=
False
enable_p2p_check
:
bool
=
False
# Distributed args
nccl_init_addr
:
Optional
[
str
]
=
None
...
...
@@ -304,6 +305,11 @@ class ServerArgs:
help
=
"Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
"This only affects Triton attention kernels"
,
)
parser
.
add_argument
(
"--enable-p2p-check"
,
action
=
"store_true"
,
help
=
"Enable P2P check for GPU access, otherwise the p2p access is allowed by default."
,
)
@
classmethod
def
from_cli_args
(
cls
,
args
:
argparse
.
Namespace
):
...
...
python/sglang/srt/utils.py
View file @
5304b4ef
...
...
@@ -458,13 +458,8 @@ def monkey_patch_vllm_p2p_access_check(gpu_id: int):
NOTE: We assume the p2p access is always allowed, which can be wrong for some setups.
"""
# TODO: need a better check than just dev str name match
# compat: skip RTX 40 series as they do not have P2P feature and even checking for them may cause errors
device_name
=
torch
.
cuda
.
get_device_name
(
gpu_id
)
if
"RTX 40"
not
in
device_name
:
import
vllm.distributed.device_communicators.custom_all_reduce_utils
as
tgt
setattr
(
tgt
,
"gpu_p2p_access_check"
,
lambda
*
arg
,
**
kwargs
:
True
)
import
vllm.distributed.device_communicators.custom_all_reduce_utils
as
tgt
setattr
(
tgt
,
"gpu_p2p_access_check"
,
lambda
*
arg
,
**
kwargs
:
True
)
def
monkey_patch_vllm_dummy_weight_loader
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment