Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
fbd56002
Unverified
Commit
fbd56002
authored
Mar 09, 2025
by
Lianmin Zheng
Committed by
GitHub
Mar 09, 2025
Browse files
Auto balance CI tests (#4238)
parent
730d084f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
151 additions
and
84 deletions
+151
-84
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+2
-6
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+12
-3
test/lang/run_suite.py
test/lang/run_suite.py
+9
-1
test/srt/run_suite.py
test/srt/run_suite.py
+126
-73
test/srt/test_custom_allreduce.py
test/srt/test_custom_allreduce.py
+2
-1
No files found.
.github/workflows/pr-test.yml
View file @
fbd56002
...
...
@@ -95,7 +95,7 @@ jobs:
strategy
:
fail-fast
:
false
matrix
:
range
:
[
0-6
,
6-15
,
15-22
,
22-32
,
32-40
,
40-48
,
48-100
]
part
:
[
0
,
1
,
2
,
3
,
4
,
5
,
6
]
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v3
...
...
@@ -109,11 +109,8 @@ jobs:
-
name
:
Run test
timeout-minutes
:
30
run
:
|
RANGE=${{ matrix.range }}
range_begin=${RANGE%-*}
range_end=${RANGE#*-}
cd test/srt
python3 run_suite.py --suite per-commit --
range-begin ${range_begin} --range-end ${range_end}
python3 run_suite.py --suite per-commit --
auto-partition-id ${{ matrix.part }} --auto-partition-size 7
unit-test-backend-2-gpu
:
needs
:
filter
...
...
@@ -340,7 +337,6 @@ jobs:
python3 test_moe_eval_accuracy_large.py
finish
:
if
:
always()
needs
:
[
unit-test-frontend
,
unit-test-backend-1-gpu
,
unit-test-backend-2-gpu
,
performance-test-1-gpu-part-1
,
performance-test-1-gpu-part-2
,
performance-test-2-gpu
,
...
...
python/sglang/test/test_utils.py
View file @
fbd56002
...
...
@@ -446,22 +446,31 @@ def run_with_timeout(
return
ret_value
[
0
]
def
run_unittest_files
(
files
:
List
[
str
]
,
timeout_per_file
:
float
):
def
run_unittest_files
(
files
:
List
,
timeout_per_file
:
float
):
tic
=
time
.
time
()
success
=
True
for
filename
in
files
:
for
file
in
files
:
filename
,
estimated_time
=
file
.
name
,
file
.
estimated_time
process
=
None
def
run_one_file
(
filename
):
nonlocal
process
filename
=
os
.
path
.
join
(
os
.
getcwd
(),
filename
)
print
(
f
"
\n\n
Run:
\n
python3
{
filename
}
\n\n
"
,
flush
=
True
)
print
(
f
".
\n
.
\n
Begin:
\n
python3
{
filename
}
\n
.
\n
.
\n
"
,
flush
=
True
)
tic
=
time
.
time
()
process
=
subprocess
.
Popen
(
[
"python3"
,
filename
],
stdout
=
None
,
stderr
=
None
,
env
=
os
.
environ
)
process
.
wait
()
elapsed
=
time
.
time
()
-
tic
print
(
f
".
\n
.
\n
End:
\n
{
filename
=
}
,
{
elapsed
=
:.
0
f
}
,
{
estimated_time
=
}
\n
.
\n
.
\n
"
,
flush
=
True
,
)
return
process
.
returncode
try
:
...
...
test/lang/run_suite.py
View file @
fbd56002
import
argparse
import
glob
from
dataclasses
import
dataclass
from
sglang.test.test_utils
import
run_unittest_files
@dataclass
class TestFile:
    """A test file entry in a suite, paired with an estimate of its run time."""

    # Path of the test file, as written in the suite lists of this module.
    name: str
    # Estimated run time of the file; 60 when not given.
    # NOTE(review): units are presumably seconds — confirm against the runner.
    estimated_time: float = 60
suites
=
{
"per-commit"
:
[
"test_srt_backend.py"
,
TestFile
(
"test_srt_backend.py"
)
,
# Skip this due to some OPENAI_API_KEY issues
# "test_openai_backend.py",
],
...
...
test/srt/run_suite.py
View file @
fbd56002
import
argparse
import
glob
from
dataclasses
import
dataclass
from
sglang.test.test_utils
import
run_unittest_files
@dataclass
class TestFile:
    """A test file entry in a suite, paired with an estimate of its run time."""

    # Path of the test file, as written in the suite lists of this module.
    name: str
    # Estimated run time of the file; 60 when not given. Used as the weight
    # when the files are split into balanced partitions.
    # NOTE(review): units are presumably seconds — confirm against the runner.
    estimated_time: float = 60
suites
=
{
"per-commit"
:
[
"models/lora/test_lora.py"
,
"models/lora/test_lora_backend.py"
,
"models/lora/test_multi_lora_backend.py"
,
"models/test_embedding_models.py"
,
"models/test_generation_models.py"
,
"models/test_qwen_models.py"
,
"models/test_reward_models.py"
,
"test_gptqmodel_dynamic.py"
,
"models/test_gme_qwen_models.py"
,
"test_abort.py"
,
"test_chunked_prefill.py"
,
"test_custom_allreduce.py"
,
"test_double_sparsity.py"
,
"test_eagle_infer.py"
,
"test_embedding_openai_server.py"
,
"test_eval_accuracy_mini.py"
,
"test_gguf.py"
,
"test_input_embeddings.py"
,
"test_mla.py"
,
"test_mla_deepseek_v3.py"
,
"test_mla_flashinfer.py"
,
"test_mla_fp8.py"
,
"test_json_constrained.py"
,
"test_large_max_new_tokens.py"
,
"test_metrics.py"
,
"test_no_chunked_prefill.py"
,
"test_no_overlap_scheduler.py"
,
"test_openai_server.py"
,
"test_penalty.py"
,
"test_pytorch_sampling_backend.py"
,
"test_radix_attention.py"
,
"test_regex_constrained.py"
,
"test_release_memory_occupation.py"
,
"test_request_length_validation.py"
,
"test_retract_decode.py"
,
"test_server_args.py"
,
# Disabled temporarily
# "test_session_control.py",
"test_skip_tokenizer_init.py"
,
"test_srt_engine.py"
,
"test_srt_endpoint.py"
,
"test_torch_compile.py"
,
"test_torch_compile_moe.py"
,
"test_torch_native_attention_backend.py"
,
"test_torchao.py"
,
"test_triton_attention_kernels.py"
,
"test_triton_attention_backend.py"
,
"test_hidden_states.py"
,
"test_update_weights_from_disk.py"
,
"test_update_weights_from_tensor.py"
,
"test_vertex_endpoint.py"
,
"test_vision_chunked_prefill.py"
,
"test_vision_llm.py"
,
"test_vision_openai_server.py"
,
"test_w8a8_quantization.py"
,
"test_fp8_kernel.py"
,
"test_block_int8.py"
,
"test_int8_kernel.py"
,
"test_reasoning_content.py"
,
TestFile
(
"models/lora/test_lora.py"
,
76
),
TestFile
(
"models/lora/test_lora_backend.py"
,
420
),
TestFile
(
"models/lora/test_multi_lora_backend.py"
,
1
),
TestFile
(
"models/test_embedding_models.py"
,
119
),
TestFile
(
"models/test_generation_models.py"
,
103
),
TestFile
(
"models/test_qwen_models.py"
,
82
),
TestFile
(
"models/test_reward_models.py"
,
83
),
TestFile
(
"test_gptqmodel_dynamic.py"
,
72
),
TestFile
(
"models/test_gme_qwen_models.py"
,
45
),
TestFile
(
"test_abort.py"
,
51
),
TestFile
(
"test_chunked_prefill.py"
,
336
),
TestFile
(
"test_custom_allreduce.py"
,
1
),
TestFile
(
"test_double_sparsity.py"
,
50
),
TestFile
(
"test_eagle_infer.py"
,
447
),
TestFile
(
"test_embedding_openai_server.py"
,
36
),
TestFile
(
"test_eval_accuracy_mini.py"
,
63
),
TestFile
(
"test_gguf.py"
,
78
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_mla.py"
,
92
),
TestFile
(
"test_mla_deepseek_v3.py"
,
221
),
TestFile
(
"test_mla_flashinfer.py"
,
395
),
TestFile
(
"test_mla_fp8.py"
,
93
),
TestFile
(
"test_json_constrained.py"
,
98
),
TestFile
(
"test_large_max_new_tokens.py"
,
41
),
TestFile
(
"test_metrics.py"
,
32
),
TestFile
(
"test_no_chunked_prefill.py"
,
126
),
TestFile
(
"test_no_overlap_scheduler.py"
,
262
),
TestFile
(
"test_openai_server.py"
,
124
),
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
TestFile
(
"test_radix_attention.py"
,
167
),
TestFile
(
"test_regex_constrained.py"
,
64
),
TestFile
(
"test_release_memory_occupation.py"
,
44
),
TestFile
(
"test_request_length_validation.py"
,
31
),
TestFile
(
"test_retract_decode.py"
,
54
),
TestFile
(
"test_server_args.py"
,
1
),
TestFile
(
"test_skip_tokenizer_init.py"
,
72
),
TestFile
(
"test_srt_engine.py"
,
237
),
TestFile
(
"test_srt_endpoint.py"
,
94
),
TestFile
(
"test_torch_compile.py"
,
76
),
TestFile
(
"test_torch_compile_moe.py"
,
85
),
TestFile
(
"test_torch_native_attention_backend.py"
,
149
),
TestFile
(
"test_torchao.py"
,
70
),
TestFile
(
"test_triton_attention_kernels.py"
,
4
),
TestFile
(
"test_triton_attention_backend.py"
,
134
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_update_weights_from_disk.py"
,
114
),
TestFile
(
"test_update_weights_from_tensor.py"
,
48
),
TestFile
(
"test_vertex_endpoint.py"
,
31
),
TestFile
(
"test_vision_chunked_prefill.py"
,
223
),
TestFile
(
"test_vision_llm.py"
,
18.4
),
TestFile
(
"test_vision_openai_server.py"
,
344
),
TestFile
(
"test_w8a8_quantization.py"
,
46
),
TestFile
(
"test_fp8_kernel.py"
,
2
),
TestFile
(
"test_block_int8.py"
,
22
),
TestFile
(
"test_int8_kernel.py"
,
1
),
TestFile
(
"test_reasoning_content.py"
,
89
),
],
"nightly"
:
[
"test_nightly_gsm8k_eval.py"
,
# Disable temporarily
# "test_nightly_math_eval.py",
TestFile
(
"test_nightly_gsm8k_eval.py"
),
],
}
# Expand suite: a suite's list may contain the *name* of another suite as an
# entry. For every ordered pair of distinct suites, if the target suite's name
# appears in the other suite's list, drop that entry and append the target
# suite's tests in its place.
for target_suite_name, target_tests in suites.items():
    for suite_name, tests in suites.items():
        if suite_name == target_suite_name:
            # A suite is never expanded into itself.
            continue
        if target_suite_name in tests:
            # Only the first occurrence of the name is removed, and the
            # expanded tests are added at the end of the list rather than at
            # the position the name occupied.
            tests.remove(target_suite_name)
            tests.extend(target_tests)
def auto_partition(files, rank, size):
    """
    Split files into `size` partitions with approximately equal sums of
    estimated times and return the partition for the specified rank.

    Files are visited from the largest estimated time to the smallest (ties
    keep their original order) and each one is assigned to the partition whose
    running total is currently the smallest (the first such partition on a
    tie). The assignment is deterministic, so independent callers using the
    same `files` and `size` but different `rank` values get disjoint
    partitions that together cover every file.

    When there are fewer files than partitions, the surplus partitions are
    simply empty; every file is still assigned to exactly one rank.

    Args:
        files (list): List of file objects with estimated_time attribute
        rank (int): Index of the partition to return (0 to size-1)
        size (int): Number of partitions

    Returns:
        list: List of file objects in the specified rank's partition. Empty
            when `files` is empty or `size` is not positive.

    Raises:
        IndexError: If `rank` is outside the range 0 to size-1.
    """
    if not files or size <= 0:
        return []
    if not 0 <= rank < size:
        # A negative rank would otherwise silently index from the end and
        # duplicate another rank's partition.
        raise IndexError(
            f"auto_partition rank {rank} is out of range for size {size}"
        )

    weights = [f.estimated_time for f in files]

    # Indices ordered by descending weight; equal weights keep original order.
    order = sorted(range(len(files)), key=lambda i: (-weights[i], i))

    partitions = [[] for _ in range(size)]
    sums = [0.0] * size

    # Greedy approach: assign each file to the partition with the smallest
    # current sum (min() returns the first such partition on ties).
    for idx in order:
        lightest = min(range(size), key=sums.__getitem__)
        partitions[lightest].append(idx)
        sums[lightest] += weights[idx]

    return [files[i] for i in partitions[rank]]
if
__name__
==
"__main__"
:
arg_parser
=
argparse
.
ArgumentParser
()
...
...
@@ -108,17 +148,30 @@ if __name__ == "__main__":
default
=
None
,
help
=
"The end index of the range of the files to run."
,
)
arg_parser
.
add_argument
(
"--auto-partition-id"
,
type
=
int
,
help
=
"Use auto load balancing. The part id."
,
)
arg_parser
.
add_argument
(
"--auto-partition-size"
,
type
=
int
,
help
=
"Use auto load balancing. The number of parts."
,
)
args
=
arg_parser
.
parse_args
()
print
(
f
"
{
args
=
}
"
)
if
args
.
suite
==
"all"
:
files
=
glob
.
glob
(
"**/test_*.py"
,
recursive
=
True
)
else
:
files
=
suites
[
args
.
suite
]
files
=
files
[
args
.
range_begin
:
args
.
range_end
]
if
args
.
auto_partition_size
:
files
=
auto_partition
(
files
,
args
.
auto_partition_id
,
args
.
auto_partition_size
)
else
:
files
=
files
[
args
.
range_begin
:
args
.
range_end
]
print
(
f
"
{
args
=
}
"
)
print
(
"The running tests are "
,
files
)
print
(
"The running tests are "
,
[
f
.
name
for
f
in
files
])
exit_code
=
run_unittest_files
(
files
,
args
.
timeout_per_file
)
exit
(
exit_code
)
test/srt/test_custom_allreduce.py
View file @
fbd56002
...
...
@@ -42,7 +42,8 @@ def multi_process_parallel(
# as compared to multiprocessing.
# NOTE: We need to set working_dir for distributed tests,
# otherwise we may get import errors on ray workers
ray
.
init
(
log_to_driver
=
False
)
ray
.
init
(
log_to_driver
=
True
)
distributed_init_port
=
get_open_port
()
refs
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment