Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
35ca04d2
Unverified
Commit
35ca04d2
authored
Apr 27, 2025
by
Lianmin Zheng
Committed by
GitHub
Apr 27, 2025
Browse files
[CI] fix port conflicts (#5789)
parent
3c4e0ee6
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
55 additions
and
51 deletions
+55
-51
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+3
-3
python/sglang/bench_serving.py
python/sglang/bench_serving.py
+4
-7
python/sglang/srt/entrypoints/http_server.py
python/sglang/srt/entrypoints/http_server.py
+3
-1
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+0
-1
test/srt/run_suite.py
test/srt/run_suite.py
+25
-25
test/srt/test_torch_compile_moe.py
test/srt/test_torch_compile_moe.py
+1
-1
test/srt/test_update_weights_from_distributed.py
test/srt/test_update_weights_from_distributed.py
+19
-13
No files found.
.github/workflows/pr-test.yml
View file @
35ca04d2
...
...
@@ -54,7 +54,7 @@ jobs:
strategy
:
fail-fast
:
false
matrix
:
part
:
[
0
,
1
,
2
,
3
,
4
,
5
,
6
]
part
:
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
]
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v4
...
...
@@ -64,10 +64,10 @@ jobs:
bash scripts/ci_install_dependency.sh
-
name
:
Run test
timeout-minutes
:
40
timeout-minutes
:
30
run
:
|
cd test/srt
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 8
unit-test-backend-2-gpu
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
...
...
python/sglang/bench_serving.py
View file @
35ca04d2
...
...
@@ -977,6 +977,7 @@ async def benchmark(
profile
:
bool
,
pd_seperated
:
bool
=
False
,
flush_cache
:
bool
=
False
,
warmup_requests
:
int
=
1
,
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
...
...
@@ -993,10 +994,8 @@ async def benchmark(
async
with
semaphore
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
if
not
hasattr
(
args
,
"warmup_requests"
):
args
.
warmup_requests
=
1
# Warmup
print
(
f
"Starting warmup with
{
args
.
warmup_requests
}
sequences..."
)
print
(
f
"Starting warmup with
{
warmup_requests
}
sequences..."
)
# Use the first request for all warmup iterations
test_prompt
,
test_prompt_len
,
test_output_len
=
input_requests
[
0
]
...
...
@@ -1018,7 +1017,7 @@ async def benchmark(
# Run warmup requests
warmup_tasks
=
[]
for
_
in
range
(
args
.
warmup_requests
):
for
_
in
range
(
warmup_requests
):
warmup_tasks
.
append
(
asyncio
.
create_task
(
request_func
(
request_func_input
=
test_input
))
)
...
...
@@ -1026,9 +1025,7 @@ async def benchmark(
warmup_outputs
=
await
asyncio
.
gather
(
*
warmup_tasks
)
# Check if at least one warmup request succeeded
if
args
.
warmup_requests
>
0
and
not
any
(
output
.
success
for
output
in
warmup_outputs
):
if
warmup_requests
>
0
and
not
any
(
output
.
success
for
output
in
warmup_outputs
):
raise
ValueError
(
"Warmup failed - Please make sure benchmark arguments "
f
"are correctly specified. Error:
{
warmup_outputs
[
0
].
error
}
"
...
...
python/sglang/srt/entrypoints/http_server.py
View file @
35ca04d2
...
...
@@ -281,7 +281,9 @@ async def generate_from_file_request(file: UploadFile, request: Request):
)
try
:
ret
=
await
_global_state
.
generate_request
(
obj
,
request
).
__anext__
()
ret
=
await
_global_state
.
tokenizer_manager
.
generate_request
(
obj
,
request
).
__anext__
()
return
ret
except
ValueError
as
e
:
logger
.
error
(
f
"Error:
{
e
}
"
)
...
...
python/sglang/test/test_utils.py
View file @
35ca04d2
...
...
@@ -8,7 +8,6 @@ import random
import
subprocess
import
threading
import
time
import
traceback
import
unittest
from
concurrent.futures
import
ThreadPoolExecutor
from
dataclasses
import
dataclass
...
...
test/srt/run_suite.py
View file @
35ca04d2
...
...
@@ -14,7 +14,7 @@ class TestFile:
suites
=
{
"per-commit"
:
[
TestFile
(
"models/lora/test_lora.py"
,
76
),
TestFile
(
"models/lora/test_lora_backend.py"
,
420
),
TestFile
(
"models/lora/test_lora_backend.py"
,
99
),
TestFile
(
"models/lora/test_multi_lora_backend.py"
,
60
),
TestFile
(
"models/test_embedding_models.py"
,
35
),
TestFile
(
"models/test_generation_models.py"
,
103
),
...
...
@@ -23,30 +23,30 @@ suites = {
TestFile
(
"models/test_compressed_tensors_models.py"
,
100
),
TestFile
(
"models/test_reward_models.py"
,
83
),
TestFile
(
"models/test_gme_qwen_models.py"
,
45
),
TestFile
(
"models/test_clip_models.py"
,
100
),
TestFile
(
"models/test_vlm_models.py"
,
100
),
TestFile
(
"models/test_clip_models.py"
,
52
),
TestFile
(
"models/test_vlm_models.py"
,
581
),
TestFile
(
"test_abort.py"
,
51
),
TestFile
(
"test_block_int8.py"
,
22
),
TestFile
(
"test_chunked_prefill.py"
,
336
),
TestFile
(
"test_eagle_infer.py"
,
500
),
TestFile
(
"test_chunked_prefill.py"
,
285
),
TestFile
(
"test_eagle_infer.py"
,
584
),
TestFile
(
"test_ebnf_constrained.py"
),
TestFile
(
"test_fa3.py"
,
400
),
TestFile
(
"test_fa3.py"
,
376
),
TestFile
(
"test_fp8_kernel.py"
,
8
),
TestFile
(
"test_embedding_openai_server.py"
,
36
),
TestFile
(
"test_embedding_openai_server.py"
,
141
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_int8_kernel.py"
,
8
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_json_constrained.py"
,
98
),
TestFile
(
"test_large_max_new_tokens.py"
,
41
),
TestFile
(
"test_metrics.py"
,
32
),
TestFile
(
"test_mla.py"
,
162
),
TestFile
(
"test_mla.py"
,
242
),
TestFile
(
"test_mla_deepseek_v3.py"
,
221
),
TestFile
(
"test_mla_int8_deepseek_v3.py"
,
522
),
TestFile
(
"test_mla_int8_deepseek_v3.py"
,
674
),
TestFile
(
"test_mla_flashinfer.py"
,
395
),
TestFile
(
"test_mla_fp8.py"
,
93
),
TestFile
(
"test_mla_fp8.py"
,
153
),
TestFile
(
"test_no_chunked_prefill.py"
,
126
),
TestFile
(
"test_no_overlap_scheduler.py"
,
262
),
TestFile
(
"test_openai_server.py"
,
186
),
TestFile
(
"test_openai_server.py"
,
149
),
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_page_size.py"
,
60
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
...
...
@@ -57,11 +57,11 @@ suites = {
TestFile
(
"test_request_length_validation.py"
,
31
),
TestFile
(
"test_retract_decode.py"
,
54
),
TestFile
(
"test_server_args.py"
,
1
),
TestFile
(
"test_skip_tokenizer_init.py"
,
72
),
TestFile
(
"test_skip_tokenizer_init.py"
,
117
),
TestFile
(
"test_srt_engine.py"
,
237
),
TestFile
(
"test_srt_endpoint.py"
,
94
),
TestFile
(
"test_torch_compile.py"
,
76
),
TestFile
(
"test_torch_compile_moe.py"
,
85
),
TestFile
(
"test_torch_compile_moe.py"
,
235
),
TestFile
(
"test_torch_native_attention_backend.py"
,
123
),
TestFile
(
"test_torchao.py"
,
70
),
TestFile
(
"test_triton_attention_kernels.py"
,
4
),
...
...
@@ -69,27 +69,27 @@ suites = {
TestFile
(
"test_update_weights_from_disk.py"
,
114
),
TestFile
(
"test_update_weights_from_tensor.py"
,
48
),
TestFile
(
"test_vertex_endpoint.py"
,
31
),
TestFile
(
"test_vision_chunked_prefill.py"
,
99
),
TestFile
(
"test_vision_chunked_prefill.py"
,
119
),
TestFile
(
"test_vlm_accuracy.py"
,
60
),
TestFile
(
"test_vision_openai_server.py"
,
537
),
TestFile
(
"test_vision_openai_server.py"
,
637
),
TestFile
(
"test_fim_completion.py"
,
40
),
TestFile
(
"test_w8a8_quantization.py"
,
46
),
TestFile
(
"test_eval_fp8_accuracy.py"
,
303
),
TestFile
(
"test_create_kvindices.py"
,
2
),
TestFile
(
"test_hicache.py"
,
60
),
TestFile
(
"test_hicache_mla.py"
,
90
),
TestFile
(
"test_hicache.py"
,
116
),
TestFile
(
"test_hicache_mla.py"
,
254
),
TestFile
(
"test_fused_moe.py"
,
30
),
TestFile
(
"test_triton_moe_channel_fp8_kernel.py"
,
25
),
],
"per-commit-2-gpu"
:
[
TestFile
(
"models/lora/test_lora_tp.py"
,
150
),
TestFile
(
"test_data_parallelism.py"
,
90
),
TestFile
(
"test_dp_attention.py"
,
150
),
TestFile
(
"test_mla_tp.py"
,
174
),
TestFile
(
"test_moe_ep.py"
,
220
),
TestFile
(
"test_patch_torch.py"
,
30
),
TestFile
(
"test_update_weights_from_distributed.py"
,
100
),
TestFile
(
"test_verl_engine.py"
,
100
),
TestFile
(
"models/lora/test_lora_tp.py"
,
116
),
TestFile
(
"test_data_parallelism.py"
,
73
),
TestFile
(
"test_dp_attention.py"
,
137
),
TestFile
(
"test_mla_tp.py"
,
170
),
TestFile
(
"test_moe_ep.py"
,
181
),
TestFile
(
"test_patch_torch.py"
,
19
),
TestFile
(
"test_update_weights_from_distributed.py"
,
103
),
TestFile
(
"test_verl_engine.py"
,
64
),
],
"per-commit-8-gpu"
:
[
TestFile
(
"test_local_attn.py"
,
250
),
...
...
test/srt/test_torch_compile_moe.py
View file @
35ca04d2
...
...
@@ -24,7 +24,7 @@ class TestTorchCompileMoe(CustomTestCase):
cls
.
model
,
cls
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"8"
],
other_args
=
[
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"4"
],
)
@
classmethod
...
...
test/srt/test_update_weights_from_distributed.py
View file @
35ca04d2
...
...
@@ -129,7 +129,7 @@ def init_process_hf(
hf_instruct_params
=
[]
hf_base_params
=
[]
print
(
"get parameter in hf instruct model and base model"
)
print
(
"[hf] get parameter in hf instruct model and base model"
)
for
parameter_name
in
checking_parameters
:
hf_instruct_params
.
append
(
hf_instruct_model
.
get_parameter
(
parameter_name
)[:
truncate_size
]
...
...
@@ -152,10 +152,12 @@ def init_process_hf(
param_queue
.
put
((
"hf_base_params"
,
hf_base_params
))
# Init weight update group for rank 0 (the training engine in RLHF).
print
(
f
"rank
{
rank
}
world_size:
{
world_size
}
init custom process group"
)
port
=
60000
+
int
(
os
.
environ
.
get
(
"CUDA_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
init_method
=
f
"tcp://localhost:
{
port
}
"
print
(
f
"[hf]
{
rank
=
}
{
world_size
=
}
init custom process group.
{
init_method
=
}
"
)
group
=
init_custom_process_group
(
backend
=
"nccl"
,
init_method
=
"tcp://localhost:65500"
,
init_method
=
init_method
,
world_size
=
world_size
,
rank
=
rank
,
group_name
=
"test_parameter_update_group"
,
...
...
@@ -184,7 +186,7 @@ def init_process_hf(
# Measure the latency of broadcasting/weights update.
broadcast_time
=
time_end_broadcast
-
time_begin_broadcast
print
(
f
"
rank
{
rank
}
broadcast parameter time:
{
broadcast_time
:.
3
f
}
s"
)
print
(
f
"
[hf]
{
rank
=
}
{
broadcast_time
=
:.
3
f
}
s"
)
param_queue
.
put
((
"broadcast_time"
,
broadcast_time
))
# Delete the huggingface models to free up memory.
...
...
@@ -210,17 +212,21 @@ def init_process_sgl(
torch
.
cuda
.
synchronize
()
base_gpu_id
=
1
if
rank
==
1
else
1
+
tp_size
if
backend
==
"Engine"
:
print
(
f
"[sgl] rank
{
rank
}
init engine"
)
engine
=
sgl
.
Engine
(
model_path
=
model_name
,
random_seed
=
42
,
base_gpu_id
=
base_gpu_id
,
tp_size
=
tp_size
,
cuda_graph_max_bs
=
2
,
)
else
:
if
rank
==
1
:
url
=
DEFAULT_URL_FOR_TEST
else
:
url
=
DEFAULT_URL_FOR_TEST
.
replace
(
"2157"
,
"2159"
)
host
,
port
=
DEFAULT_URL_FOR_TEST
.
split
(
":"
)
url
=
":"
.
join
(
host
,
str
(
int
(
port
)
+
10000
))
print
(
f
"[sgl] rank
{
rank
}
init server on url:
{
url
}
"
)
process
=
popen_launch_server
(
model_name
,
url
,
...
...
@@ -230,13 +236,11 @@ def init_process_sgl(
str
(
base_gpu_id
),
"--tp-size"
,
str
(
tp_size
),
"--cuda-graph-max-bs"
,
2
,
),
)
torch
.
cuda
.
synchronize
()
if
backend
==
"Engine"
:
print
(
f
"rank
{
rank
}
init engine"
)
else
:
print
(
f
"rank
{
rank
}
init server on url:
{
url
}
"
)
# Get weights of instruct model, i.e. pre-training weights.
instruct_params
=
[]
...
...
@@ -252,11 +256,13 @@ def init_process_sgl(
param_queue
.
put
((
f
"sgl_dp_
{
rank
}
_instruct_params"
,
instruct_params
))
port
=
60000
+
int
(
os
.
environ
.
get
(
"CUDA_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
# Init weight update group with the training engine.
if
backend
==
"Engine"
:
engine
.
init_weights_update_group
(
master_address
=
"localhost"
,
master_port
=
"65500"
,
master_port
=
str
(
port
)
,
rank_offset
=
base_gpu_id
,
world_size
=
world_size
,
group_name
=
"test_parameter_update_group"
,
...
...
@@ -267,7 +273,7 @@ def init_process_sgl(
f
"
{
url
}
/init_weights_update_group"
,
json
=
{
"master_address"
:
"localhost"
,
"master_port"
:
"65500"
,
"master_port"
:
str
(
port
)
,
"rank_offset"
:
base_gpu_id
,
"world_size"
:
world_size
,
"group_name"
:
"test_parameter_update_group"
,
...
...
@@ -311,7 +317,7 @@ def init_process_sgl(
# Measure the latency of broadcast/weights update.
update_time
=
time_end_update
-
time_begin_update
print
(
f
"fully update model_name
{
model_name
}
rank
{
rank
}
parameter from distributed time:
{
update_time
:.
3
f
}
s"
f
"
[sgl]
fully update model_name
{
model_name
}
rank
{
rank
}
parameter from distributed time:
{
update_time
:.
3
f
}
s"
)
param_queue
.
put
((
f
"update_sgl_dp_
{
rank
}
_time"
,
update_time
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment