Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
548a57b1
Unverified
Commit
548a57b1
authored
Oct 12, 2025
by
Lianmin Zheng
Committed by
GitHub
Oct 12, 2025
Browse files
Fix port conflicts in CI (#11497)
parent
88e73ed0
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
34 additions
and
336 deletions
+34
-336
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+2
-2
python/pyproject.toml
python/pyproject.toml
+5
-4
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+2
-5
scripts/sort_testcases_alphabetically.py
scripts/sort_testcases_alphabetically.py
+1
-314
test/srt/run_suite.py
test/srt/run_suite.py
+1
-1
test/srt/test_mla_int8_deepseek_v3.py
test/srt/test_mla_int8_deepseek_v3.py
+23
-10
No files found.
.github/workflows/pr-test.yml
View file @
548a57b1
...
@@ -327,7 +327,7 @@ jobs:
...
@@ -327,7 +327,7 @@ jobs:
strategy
:
strategy
:
fail-fast
:
false
fail-fast
:
false
matrix
:
matrix
:
part
:
[
0
,
1
,
2
]
part
:
[
0
,
1
]
steps
:
steps
:
-
name
:
Checkout code
-
name
:
Checkout code
uses
:
actions/checkout@v4
uses
:
actions/checkout@v4
...
@@ -348,7 +348,7 @@ jobs:
...
@@ -348,7 +348,7 @@ jobs:
timeout-minutes
:
20
timeout-minutes
:
20
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size
3
python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size
2
performance-test-1-gpu-part-1
:
performance-test-1-gpu-part-1
:
needs
:
[
check-changes
,
sgl-kernel-build-wheels
]
needs
:
[
check-changes
,
sgl-kernel-build-wheels
]
...
...
python/pyproject.toml
View file @
548a57b1
...
@@ -76,6 +76,7 @@ decord = ["decord"]
...
@@ -76,6 +76,7 @@ decord = ["decord"]
test
=
[
test
=
[
"accelerate"
,
"accelerate"
,
"expecttest"
,
"expecttest"
,
"gguf"
,
"jsonlines"
,
"jsonlines"
,
"matplotlib"
,
"matplotlib"
,
"pandas"
,
"pandas"
,
...
@@ -85,10 +86,10 @@ test = [
...
@@ -85,10 +86,10 @@ test = [
"tabulate"
,
"tabulate"
,
]
]
tracing
=
[
tracing
=
[
"opentelemetry-api"
,
"opentelemetry-api"
,
"opentelemetry-exporter-otlp"
,
"opentelemetry-exporter-otlp"
,
"opentelemetry-exporter-otlp-proto-grpc"
,
"opentelemetry-exporter-otlp-proto-grpc"
,
"opentelemetry-sdk"
,
"opentelemetry-sdk"
,
]
]
all
=
["sglang[test]
", "
sglang
[decord]"]
all
=
["sglang[test]
", "
sglang
[decord]"]
all_aarch64
=
["sglang[test]"]
all_aarch64
=
["sglang[test]"]
...
...
python/sglang/test/test_utils.py
View file @
548a57b1
...
@@ -135,11 +135,11 @@ def _use_cached_default_models(model_repo: str):
...
@@ -135,11 +135,11 @@ def _use_cached_default_models(model_repo: str):
if
is_in_ci
():
if
is_in_ci
():
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
=
(
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
=
(
5
000
+
int
(
os
.
environ
.
get
(
"CUDA_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
10
000
+
int
(
os
.
environ
.
get
(
"CUDA_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
0
)
)
else
:
else
:
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
=
(
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
=
(
7
000
+
int
(
os
.
environ
.
get
(
"CUDA_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
20
000
+
int
(
os
.
environ
.
get
(
"CUDA_VISIBLE_DEVICES"
,
"0"
)[
0
])
*
100
0
)
)
DEFAULT_URL_FOR_TEST
=
f
"http://127.0.0.1:
{
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
+
1000
}
"
DEFAULT_URL_FOR_TEST
=
f
"http://127.0.0.1:
{
DEFAULT_PORT_FOR_SRT_TEST_RUNNER
+
1000
}
"
...
@@ -396,8 +396,6 @@ def _get_call_generate(args: argparse.Namespace):
...
@@ -396,8 +396,6 @@ def _get_call_generate(args: argparse.Namespace):
return
partial
(
call_generate_vllm
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
/generate"
)
return
partial
(
call_generate_vllm
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
/generate"
)
elif
args
.
backend
==
"srt-raw"
:
elif
args
.
backend
==
"srt-raw"
:
return
partial
(
call_generate_srt_raw
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
/generate"
)
return
partial
(
call_generate_srt_raw
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
/generate"
)
elif
args
.
backend
==
"gserver"
:
return
partial
(
call_generate_gserver
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
"
)
elif
args
.
backend
==
"outlines"
:
elif
args
.
backend
==
"outlines"
:
return
partial
(
call_generate_outlines
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
/generate"
)
return
partial
(
call_generate_outlines
,
url
=
f
"
{
args
.
host
}
:
{
args
.
port
}
/generate"
)
elif
args
.
backend
==
"guidance"
:
elif
args
.
backend
==
"guidance"
:
...
@@ -521,7 +519,6 @@ def popen_launch_server(
...
@@ -521,7 +519,6 @@ def popen_launch_server(
# Auto-detect device if needed
# Auto-detect device if needed
if
device
==
"auto"
:
if
device
==
"auto"
:
device
=
auto_config_device
()
device
=
auto_config_device
()
print
(
f
"Auto-configed device:
{
device
}
"
,
flush
=
True
)
other_args
=
list
(
other_args
)
other_args
=
list
(
other_args
)
other_args
+=
[
"--device"
,
str
(
device
)]
other_args
+=
[
"--device"
,
str
(
device
)]
...
...
scripts/sort_testcases_alphabetically.py
View file @
548a57b1
...
@@ -11,320 +11,7 @@ class TestFile:
...
@@ -11,320 +11,7 @@ class TestFile:
estimated_time
:
float
=
60
estimated_time
:
float
=
60
suites
=
{
suites
=
{}
"per-commit"
:
[
TestFile
(
"function_call/test_json_schema_constraint.py"
,
30
),
TestFile
(
"hicache/test_hicache.py"
,
116
),
TestFile
(
"hicache/test_hicache_eagle.py"
,
150
),
TestFile
(
"hicache/test_hicache_mla.py"
,
127
),
TestFile
(
"hicache/test_hicache_storage.py"
,
127
),
TestFile
(
"lora/test_lora.py"
,
200
),
TestFile
(
"lora/test_lora_backend.py"
,
99
),
TestFile
(
"lora/test_lora_eviction.py"
,
200
),
TestFile
(
"lora/test_lora_qwen3.py"
,
97
),
TestFile
(
"lora/test_lora_radix_cache.py"
,
100
),
TestFile
(
"lora/test_lora_update.py"
,
400
),
TestFile
(
"lora/test_multi_lora_backend.py"
,
60
),
TestFile
(
"models/test_embedding_models.py"
,
73
),
TestFile
(
"models/test_encoder_embedding_models.py"
,
100
),
TestFile
(
"models/test_cross_encoder_models.py"
,
100
),
TestFile
(
"models/test_compressed_tensors_models.py"
,
42
),
TestFile
(
"models/test_generation_models.py"
,
103
),
TestFile
(
"models/test_nvidia_nemotron_nano_v2.py"
,
180
),
TestFile
(
"models/test_qwen_models.py"
,
82
),
TestFile
(
"models/test_reward_models.py"
,
132
),
TestFile
(
"models/test_transformers_models.py"
,
320
),
TestFile
(
"models/test_vlm_models.py"
,
741
),
TestFile
(
"openai_server/basic/test_protocol.py"
,
10
),
TestFile
(
"openai_server/basic/test_serving_chat.py"
,
10
),
TestFile
(
"openai_server/basic/test_serving_completions.py"
,
10
),
TestFile
(
"openai_server/basic/test_serving_embedding.py"
,
10
),
TestFile
(
"openai_server/basic/test_openai_embedding.py"
,
141
),
TestFile
(
"openai_server/basic/test_openai_server.py"
,
149
),
TestFile
(
"openai_server/features/test_enable_thinking.py"
,
70
),
TestFile
(
"openai_server/features/test_json_constrained.py"
,
98
),
TestFile
(
"openai_server/features/test_json_mode.py"
,
90
),
TestFile
(
"openai_server/features/test_openai_server_ebnf.py"
,
95
),
TestFile
(
"openai_server/features/test_openai_server_hidden_states.py"
,
240
),
TestFile
(
"openai_server/features/test_reasoning_content.py"
,
89
),
TestFile
(
"openai_server/function_call/test_openai_function_calling.py"
,
60
),
TestFile
(
"openai_server/function_call/test_tool_choice.py"
,
226
),
TestFile
(
"openai_server/validation/test_large_max_new_tokens.py"
,
41
),
TestFile
(
"openai_server/validation/test_matched_stop.py"
,
60
),
TestFile
(
"openai_server/validation/test_openai_server_ignore_eos.py"
,
85
),
TestFile
(
"openai_server/validation/test_request_length_validation.py"
,
31
),
TestFile
(
"quant/test_block_int8.py"
,
22
),
TestFile
(
"quant/test_fp8_kernel.py"
,
8
),
TestFile
(
"quant/test_int8_kernel.py"
,
8
),
TestFile
(
"quant/test_triton_scaled_mm.py"
,
8
),
TestFile
(
"quant/test_w8a8_quantization.py"
,
46
),
TestFile
(
"rl/test_fp32_lm_head.py"
,
30
),
TestFile
(
"rl/test_update_weights_from_disk.py"
,
114
),
TestFile
(
"rl/test_update_weights_from_tensor.py"
,
48
),
TestFile
(
"test_abort.py"
,
51
),
TestFile
(
"test_create_kvindices.py"
,
2
),
TestFile
(
"test_chunked_prefill.py"
,
313
),
TestFile
(
"test_deterministic.py"
,
300
),
TestFile
(
"test_eagle_infer_a.py"
,
370
),
TestFile
(
"test_eagle_infer_b.py"
,
700
),
TestFile
(
"test_eagle_infer_beta.py"
,
300
),
TestFile
(
"test_ebnf_constrained.py"
,
108
),
TestFile
(
"test_eval_fp8_accuracy.py"
,
303
),
TestFile
(
"test_fa3.py"
,
376
),
# TestFile("test_flashmla.py", 352),
TestFile
(
"test_function_call_parser.py"
,
10
),
TestFile
(
"test_fused_moe.py"
,
30
),
TestFile
(
"test_gpt_oss_1gpu.py"
,
600
),
TestFile
(
"test_harmony_parser.py"
,
20
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_hybrid_attn_backend.py"
,
100
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_io_struct.py"
,
8
),
TestFile
(
"test_jinja_template_utils.py"
,
1
),
TestFile
(
"test_logprobs.py"
,
55
),
TestFile
(
"test_metrics.py"
,
32
),
TestFile
(
"test_metrics_utils.py"
,
1
),
TestFile
(
"test_mla.py"
,
167
),
TestFile
(
"test_mla_deepseek_v3.py"
,
500
),
TestFile
(
"test_mla_int8_deepseek_v3.py"
,
429
),
TestFile
(
"test_mla_flashinfer.py"
,
302
),
TestFile
(
"test_mla_fp8.py"
,
93
),
TestFile
(
"test_modelopt_loader.py"
,
30
),
TestFile
(
"test_multi_tokenizer.py"
,
230
),
TestFile
(
"test_ngram_speculative_decoding.py"
,
250
),
TestFile
(
"test_no_chunked_prefill.py"
,
108
),
TestFile
(
"test_no_overlap_scheduler.py"
,
234
),
TestFile
(
"test_original_logprobs.py"
,
41
),
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_page_size.py"
,
60
),
TestFile
(
"test_priority_scheduling.py"
,
100
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
TestFile
(
"test_radix_attention.py"
,
105
),
TestFile
(
"test_radix_cache_unit.py"
,
5
),
TestFile
(
"test_regex_constrained.py"
,
64
),
TestFile
(
"test_reasoning_parser.py"
,
5
),
TestFile
(
"test_retract_decode.py"
,
54
),
TestFile
(
"test_request_queue_validation.py"
,
30
),
TestFile
(
"test_score_api.py"
,
180
),
TestFile
(
"test_server_args.py"
,
1
),
TestFile
(
"test_skip_tokenizer_init.py"
,
117
),
TestFile
(
"test_srt_engine.py"
,
261
),
TestFile
(
"test_srt_endpoint.py"
,
130
),
TestFile
(
"test_standalone_speculative_decoding.py"
,
250
),
TestFile
(
"test_start_profile.py"
,
60
),
TestFile
(
"test_swa_unittest.py"
,
1
),
TestFile
(
"test_torch_compile.py"
,
76
),
TestFile
(
"test_torch_compile_moe.py"
,
172
),
TestFile
(
"test_torch_native_attention_backend.py"
,
123
),
TestFile
(
"test_torchao.py"
,
70
),
TestFile
(
"test_triton_attention_kernels.py"
,
4
),
TestFile
(
"test_triton_attention_backend.py"
,
150
),
TestFile
(
"test_triton_moe_channel_fp8_kernel.py"
,
25
),
TestFile
(
"test_triton_sliding_window.py"
,
250
),
TestFile
(
"test_utils_update_weights.py"
,
48
),
TestFile
(
"test_vision_chunked_prefill.py"
,
175
),
TestFile
(
"test_vlm_input_format.py"
,
300
),
TestFile
(
"test_vision_openai_server_a.py"
,
724
),
TestFile
(
"test_vision_openai_server_b.py"
,
446
),
],
"per-commit-2-gpu"
:
[
TestFile
(
"ep/test_moe_ep.py"
,
140
),
TestFile
(
"hicache/test_hicache_storage_file_backend.py"
,
200
),
TestFile
(
"hicache/test_hicache_storage_mooncake_backend.py"
,
400
),
TestFile
(
"hicache/test_hicache_storage_3fs_backend.py"
,
200
),
TestFile
(
"layers/attention/mamba/test_mamba2_mixer.py"
,
110
),
TestFile
(
"lora/test_lora_tp.py"
,
116
),
TestFile
(
"rl/test_update_weights_from_distributed.py"
,
103
),
TestFile
(
"test_data_parallelism.py"
,
73
),
TestFile
(
"test_disaggregation_basic.py"
,
400
),
TestFile
(
"test_dp_attention.py"
,
594
),
TestFile
(
"test_load_weights_from_remote_instance.py"
,
72
),
TestFile
(
"test_patch_torch.py"
,
19
),
TestFile
(
"test_release_memory_occupation.py"
,
257
),
],
"per-commit-4-gpu"
:
[
TestFile
(
"models/test_qwen3_next_models.py"
,
291
),
TestFile
(
"test_disaggregation_dp_attention.py"
,
155
),
TestFile
(
"test_gpt_oss_4gpu.py"
,
300
),
TestFile
(
"test_local_attn.py"
,
411
),
TestFile
(
"test_multi_instance_release_memory_occupation.py"
,
64
),
TestFile
(
"test_pp_single_node.py"
,
481
),
],
"per-commit-8-gpu"
:
[
TestFile
(
"lora/test_lora_llama4.py"
,
400
),
TestFile
(
"test_deepseek_v3_basic.py"
,
275
),
TestFile
(
"test_deepseek_v3_mtp.py"
,
275
),
TestFile
(
"test_disaggregation_different_tp.py"
,
600
),
TestFile
(
"test_disaggregation_pp.py"
,
140
),
],
"per-commit-4-gpu-b200"
:
[
# TestFile("test_gpt_oss_4gpu.py", 600),
# TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
],
"per-commit-4-gpu-deepep"
:
[
TestFile
(
"ep/test_deepep_small.py"
,
531
),
],
"per-commit-8-gpu-deepep"
:
[
TestFile
(
"ep/test_deepep_large.py"
,
338
),
],
"per-commit-8-gpu-h20"
:
[
TestFile
(
"quant/test_w4a8_deepseek_v3.py"
,
371
),
],
"vllm_dependency_test"
:
[
TestFile
(
"quant/test_awq.py"
,
163
),
TestFile
(
"test_bnb.py"
,
5
),
TestFile
(
"test_gptqmodel_dynamic.py"
,
102
),
TestFile
(
"test_vllm_dependency.py"
,
185
),
# TestFile("test_gguf.py", 96),
],
}
# Add AMD tests
# NOTE: please sort the test cases alphabetically by the test file name
suite_amd
=
{
"per-commit-amd"
:
[
TestFile
(
"hicache/test_hicache.py"
,
116
),
TestFile
(
"hicache/test_hicache_mla.py"
,
127
),
TestFile
(
"hicache/test_hicache_storage.py"
,
127
),
TestFile
(
"lora/test_lora.py"
,
200
),
TestFile
(
"lora/test_lora_eviction.py"
,
200
),
TestFile
(
"lora/test_lora_backend.py"
,
99
),
TestFile
(
"lora/test_multi_lora_backend.py"
,
60
),
TestFile
(
"lora/test_lora_cuda_graph.py"
,
250
),
TestFile
(
"lora/test_lora_qwen3.py"
,
97
),
# TestFile("models/test_embedding_models.py", 73), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
TestFile
(
"models/test_compressed_tensors_models.py"
,
42
),
TestFile
(
"models/test_qwen_models.py"
,
82
),
TestFile
(
"models/test_reward_models.py"
,
132
),
TestFile
(
"models/test_transformers_models.py"
,
320
),
TestFile
(
"openai_server/basic/test_protocol.py"
,
10
),
TestFile
(
"openai_server/basic/test_serving_chat.py"
,
10
),
TestFile
(
"openai_server/basic/test_serving_completions.py"
,
10
),
TestFile
(
"openai_server/basic/test_serving_embedding.py"
,
10
),
TestFile
(
"openai_server/basic/test_openai_embedding.py"
,
141
),
TestFile
(
"openai_server/basic/test_openai_server.py"
,
149
),
TestFile
(
"openai_server/features/test_enable_thinking.py"
,
70
),
TestFile
(
"openai_server/features/test_json_constrained.py"
,
98
),
TestFile
(
"openai_server/features/test_json_mode.py"
,
90
),
TestFile
(
"openai_server/features/test_openai_server_ebnf.py"
,
95
),
# TestFile("openai_server/features/test_openai_server_hidden_states.py", 240),
TestFile
(
"openai_server/features/test_reasoning_content.py"
,
89
),
TestFile
(
"openai_server/function_call/test_openai_function_calling.py"
,
60
),
TestFile
(
"openai_server/function_call/test_tool_choice.py"
,
226
),
TestFile
(
"function_call/test_json_schema_constraint.py"
,
30
),
TestFile
(
"openai_server/validation/test_large_max_new_tokens.py"
,
41
),
TestFile
(
"openai_server/validation/test_matched_stop.py"
,
60
),
TestFile
(
"openai_server/validation/test_openai_server_ignore_eos.py"
,
85
),
TestFile
(
"openai_server/validation/test_request_length_validation.py"
,
31
),
TestFile
(
"quant/test_block_int8.py"
,
22
),
TestFile
(
"quant/test_awq_dequant.py"
,
2
),
TestFile
(
"rl/test_update_weights_from_disk.py"
,
114
),
# TestFile("rl/test_update_weights_from_tensor.py", 48),
TestFile
(
"test_abort.py"
,
51
),
TestFile
(
"test_create_kvindices.py"
,
2
),
TestFile
(
"test_chunked_prefill.py"
,
313
),
TestFile
(
"test_ebnf_constrained.py"
,
108
),
TestFile
(
"test_eval_fp8_accuracy.py"
,
303
),
TestFile
(
"test_function_call_parser.py"
,
10
),
TestFile
(
"test_fused_moe.py"
,
30
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_io_struct.py"
,
8
),
TestFile
(
"test_jinja_template_utils.py"
,
1
),
TestFile
(
"test_metrics.py"
,
32
),
TestFile
(
"test_metrics_utils.py"
,
1
),
TestFile
(
"test_mla.py"
,
242
),
TestFile
(
"test_mla_deepseek_v3.py"
,
221
),
TestFile
(
"test_no_chunked_prefill.py"
,
108
),
# TestFile("test_no_overlap_scheduler.py", 234), # Disabled temporarily and track in #7703
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_page_size.py"
,
60
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
TestFile
(
"test_radix_attention.py"
,
105
),
TestFile
(
"test_regex_constrained.py"
,
64
),
TestFile
(
"test_retract_decode.py"
,
54
),
TestFile
(
"test_reasoning_parser.py"
,
5
),
TestFile
(
"test_rope_rocm.py"
,
3
),
TestFile
(
"test_server_args.py"
,
1
),
TestFile
(
"test_skip_tokenizer_init.py"
,
117
),
TestFile
(
"test_srt_engine.py"
,
261
),
TestFile
(
"test_srt_endpoint.py"
,
130
),
TestFile
(
"test_torch_compile.py"
,
169
),
TestFile
(
"test_torch_compile_moe.py"
,
172
),
TestFile
(
"test_torch_native_attention_backend.py"
,
123
),
TestFile
(
"test_triton_attention_backend.py"
,
150
),
# TestFile("test_vision_chunked_prefill.py", 175), # Disabled temporarily and track in #7701
TestFile
(
"test_wave_attention_kernels.py"
,
2
),
# TestFile("test_wave_attention_backend.py", 150), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
],
"per-commit-amd-mi35x"
:
[
TestFile
(
"test_mla.py"
,
242
),
TestFile
(
"test_gpt_oss_1gpu.py"
,
600
),
],
"per-commit-2-gpu-amd"
:
[
TestFile
(
"lora/test_lora_tp.py"
,
116
),
TestFile
(
"rl/test_update_weights_from_distributed.py"
,
103
),
TestFile
(
"test_data_parallelism.py"
,
73
),
TestFile
(
"test_load_weights_from_remote_instance.py"
,
72
),
# TestFile("test_patch_torch.py", 19), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127
],
"per-commit-4-gpu-amd"
:
[
TestFile
(
"test_pp_single_node.py"
,
150
),
],
"per-commit-8-gpu-amd"
:
[
TestFile
(
"test_deepseek_v3_basic.py"
,
275
),
TestFile
(
"test_deepseek_v3_mtp.py"
,
275
),
],
"nightly-amd"
:
[
TestFile
(
"test_nightly_gsm8k_eval_amd.py"
),
],
}
# Add Intel Xeon tests
# NOTE: please sort the test cases alphabetically by the test file name
suite_xeon
=
{
"per-commit-cpu"
:
[
TestFile
(
"cpu/test_activation.py"
),
TestFile
(
"cpu/test_binding.py"
),
TestFile
(
"cpu/test_decode.py"
),
TestFile
(
"cpu/test_extend.py"
),
TestFile
(
"cpu/test_gemm.py"
),
TestFile
(
"cpu/test_mla.py"
),
TestFile
(
"cpu/test_moe.py"
),
TestFile
(
"cpu/test_norm.py"
),
TestFile
(
"cpu/test_qkv_proj_with_rope.py"
),
TestFile
(
"cpu/test_rope.py"
),
TestFile
(
"cpu/test_shared_expert.py"
),
TestFile
(
"cpu/test_topk.py"
),
TestFile
(
"test_intel_amx_attention_backend.py"
),
TestFile
(
"test_cpu_graph.py"
),
],
}
# Add Ascend NPU tests
# NOTE: please sort the test cases alphabetically by the test file name
suite_ascend
=
{
"per-commit-1-ascend-npu"
:
[
TestFile
(
"ascend/test_ascend_graph_tp1_bf16.py"
,
400
),
TestFile
(
"ascend/test_ascend_tp1_bf16.py"
,
400
),
],
"per-commit-2-ascend-npu"
:
[
TestFile
(
"ascend/test_ascend_graph_tp2_bf16.py"
,
400
),
TestFile
(
"ascend/test_ascend_mla_fia_w8a8int8.py"
,
400
),
TestFile
(
"ascend/test_ascend_tp2_bf16.py"
,
400
),
TestFile
(
"ascend/test_ascend_tp2_fia_bf16.py"
,
400
),
],
"per-commit-4-ascend-npu"
:
[
TestFile
(
"ascend/test_ascend_mla_w8a8int8.py"
,
400
),
TestFile
(
"ascend/test_ascend_tp4_bf16.py"
,
400
),
],
"per-commit-16-ascend-a3"
:
[
TestFile
(
"ascend/test_ascend_deepep.py"
,
400
),
],
}
suites
.
update
(
suite_amd
)
suites
.
update
(
suite_xeon
)
suites
.
update
(
suite_ascend
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
test/srt/run_suite.py
View file @
548a57b1
...
@@ -79,7 +79,7 @@ suites = {
...
@@ -79,7 +79,7 @@ suites = {
TestFile
(
"test_gpt_oss_1gpu.py"
,
600
),
TestFile
(
"test_gpt_oss_1gpu.py"
,
600
),
TestFile
(
"test_harmony_parser.py"
,
20
),
TestFile
(
"test_harmony_parser.py"
,
20
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_hybrid_attn_backend.py"
,
100
),
TestFile
(
"test_hybrid_attn_backend.py"
,
379
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_io_struct.py"
,
8
),
TestFile
(
"test_io_struct.py"
,
8
),
TestFile
(
"test_jinja_template_utils.py"
,
1
),
TestFile
(
"test_jinja_template_utils.py"
,
1
),
...
...
test/srt/test_mla_int8_deepseek_v3.py
View file @
548a57b1
...
@@ -22,7 +22,15 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
...
@@ -22,7 +22,15 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
other_args
=
[
"--trust-remote-code"
]
other_args
=
[
"--trust-remote-code"
]
if
torch
.
cuda
.
is_available
()
and
torch
.
version
.
cuda
:
if
torch
.
cuda
.
is_available
()
and
torch
.
version
.
cuda
:
other_args
.
extend
([
"--enable-torch-compile"
,
"--cuda-graph-max-bs"
,
"2"
])
other_args
.
extend
(
[
"--cuda-graph-max-bs"
,
"16"
,
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"2"
,
]
)
cls
.
process
=
popen_launch_server
(
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
model
,
cls
.
base_url
,
cls
.
base_url
,
...
@@ -50,6 +58,7 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
...
@@ -50,6 +58,7 @@ class TestMLADeepseekV3ChannelInt8(CustomTestCase):
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
0.61
)
self
.
assertGreaterEqual
(
metrics
[
"accuracy"
],
0.61
)
@
unittest
.
skipIf
(
is_in_ci
(),
"To reduce the CI execution time."
)
class
TestDeepseekV3MTPChannelInt8
(
CustomTestCase
):
class
TestDeepseekV3MTPChannelInt8
(
CustomTestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
...
@@ -60,12 +69,10 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
...
@@ -60,12 +69,10 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
other_args
.
extend
(
other_args
.
extend
(
[
[
"--cuda-graph-max-bs"
,
"--cuda-graph-max-bs"
,
"2"
,
"16"
,
"--disable-radix"
,
"--enable-torch-compile"
,
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"--torch-compile-max-bs"
,
"1"
,
"2"
"--speculative-algorithm"
,
"--speculative-algorithm"
,
"EAGLE"
,
"EAGLE"
,
"--speculative-draft-model-path"
,
"--speculative-draft-model-path"
,
"sgl-project/sglang-ci-dsv3-channel-int8-test-NextN"
,
"sgl-project/sglang-ci-dsv3-channel-int8-test-NextN"
,
...
@@ -121,7 +128,15 @@ class TestMLADeepseekV3BlockInt8(CustomTestCase):
...
@@ -121,7 +128,15 @@ class TestMLADeepseekV3BlockInt8(CustomTestCase):
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
other_args
=
[
"--trust-remote-code"
]
other_args
=
[
"--trust-remote-code"
]
if
torch
.
cuda
.
is_available
()
and
torch
.
version
.
cuda
:
if
torch
.
cuda
.
is_available
()
and
torch
.
version
.
cuda
:
other_args
.
extend
([
"--enable-torch-compile"
,
"--cuda-graph-max-bs"
,
"2"
])
other_args
.
extend
(
[
"--cuda-graph-max-bs"
,
"16"
,
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"2"
,
]
)
cls
.
process
=
popen_launch_server
(
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
model
,
cls
.
base_url
,
cls
.
base_url
,
...
@@ -159,12 +174,10 @@ class TestDeepseekV3MTPBlockInt8(CustomTestCase):
...
@@ -159,12 +174,10 @@ class TestDeepseekV3MTPBlockInt8(CustomTestCase):
other_args
.
extend
(
other_args
.
extend
(
[
[
"--cuda-graph-max-bs"
,
"--cuda-graph-max-bs"
,
"2"
,
"16"
,
"--disable-radix"
,
"--enable-torch-compile"
,
"--enable-torch-compile"
,
"--torch-compile-max-bs"
,
"--torch-compile-max-bs"
,
"1"
,
"2"
"--speculative-algorithm"
,
"--speculative-algorithm"
,
"EAGLE"
,
"EAGLE"
,
"--speculative-num-steps"
,
"--speculative-num-steps"
,
"2"
,
"2"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment