sglang · Commit aa957102 (Unverified)
Authored Mar 10, 2025 by Lianmin Zheng; committed by GitHub on Mar 10, 2025
Simplify tests & Fix trtllm custom allreduce registration (#4252)
Parent: 007f8b3d
Showing 13 changed files with 30 additions and 211 deletions
.github/workflows/pr-test.yml                               +1  -1
python/sglang/srt/_custom_ops.py                            +4  -2
python/sglang/srt/layers/attention/flashinfer_backend.py    +1  -1
python/sglang/srt/model_loader/loader.py                    +2  -1
sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu                 +3  -2
sgl-kernel/csrc/torch_extension.cc                          +2  -5
sgl-kernel/tests/test_rotary_embedding.py                   +0  -2
test/srt/run_suite.py                                       +10 -12
test/srt/test_bench_one_batch.py                            +6  -1
test/srt/test_eval_accuracy_large_chunked_prefill.py        +0  -68
test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py  +0  -74
test/srt/test_eval_accuracy_mini.py                         +0  -42
test/srt/test_gptqmodel_dynamic.py                          +1  -0
.github/workflows/pr-test.yml

@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
     - name: Benchmark single latency + torch.compile (TP=2)
       timeout-minutes: 10
python/sglang/srt/_custom_ops.py

@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu

 logger = logging.getLogger(__name__)

-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default="true")

 if not is_hpu():
     # ROCm does not use vllm custom allreduce
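Context for this hunk (not part of the commit): os.environ.get returns the raw string (or the default object unchanged), so any non-empty value such as "0" or "false" is still truthy, which makes the old line misleading. A minimal sketch of a get_bool_env_var-style helper, assuming the real implementation in sglang.srt.utils may accept other spellings:

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Normalize the raw string before interpreting it as a boolean.
    value = os.getenv(name, default).strip().lower()
    return value in ("1", "true", "yes")

# With the old pattern, any non-empty string is truthy:
os.environ["USE_VLLM_CUSTOM_ALLREDUCE"] = "0"
print(bool(os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", True)))  # True, despite "0"
print(get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", "true"))    # False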
python/sglang/srt/layers/attention/flashinfer_backend.py

@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
python/sglang/srt/model_loader/loader.py

@@ -48,6 +48,7 @@ from sglang.srt.model_loader.weight_utils import (
     safetensors_weights_iterator,
 )
 from sglang.srt.utils import (
+    get_bool_env_var,
     get_device_capability,
     is_pin_memory_available,
     set_weight_attrs,

@@ -197,7 +198,7 @@ class DefaultModelLoader(BaseModelLoader):
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
-        if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
             # pylint: disable=C.
sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu

@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
   check_device_dtype(out_dtype, inputs);
   check_device_dtype(out_dtype, weights);
   check_device_dtype(out_dtype, outputs);
-  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);

   // Weights should be transposed to (n, k) of column major
   std::vector<cublasOperation_t> transa_array(group_count, CUBLAS_OP_T);

@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
   std::vector<void*> b_array = get_tensor_ptrs(inputs);
   std::vector<void*> c_array = get_tensor_ptrs(outputs);
-  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
   auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);

   // Should allocate tensors for storage of pointers

@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
   torch::Tensor d_c = create_ptr_pointer(c_array, stream);

 #if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
+  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
   auto status = cublasGemmGroupedBatchedEx(
       handle,
       transa_array.data(),
sgl-kernel/csrc/torch_extension.cc

@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   m.def("all_reduce(int fa, Tensor inp, Tensor! out) -> ()");
   m.impl("all_reduce", torch::kCUDA, &all_reduce);
-  m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])");
-  m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta);
-  m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()");
-  m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers);
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);

   /*
    * From csrc/attention
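Likely rationale (hedged, not stated in the commit message): these two helpers take only an int handle, not a CUDA tensor, so a torch::kCUDA impl gives the dispatcher nothing to dispatch on; binding the functions directly with m.def(name, &fn) registers them as ordinary catch-all ops. From Python they stay reachable under the sgl_kernel namespace, roughly as in this sketch (the fa handle and the metadata gathering are assumed and omitted here):

import torch
import sgl_kernel  # noqa: F401  # importing loads the extension and its op registrations

def export_graph_buffers(fa: int):
    # Returns this rank's IPC metadata for the captured CUDA-graph buffers.
    return torch.ops.sgl_kernel.get_graph_buffer_ipc_meta(fa)

def sync_graph_buffers(fa: int, handles, offsets) -> None:
    # `fa` is the opaque custom-allreduce handle created elsewhere (assumed);
    # `handles`/`offsets` are the metadata gathered from all ranks.
    torch.ops.sgl_kernel.register_graph_buffers(fa, handles, offsets)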
sgl-kernel/tests/test_rotary_embedding.py

import math
from typing import Any, Dict, List, Optional, Tuple, Union

import pytest
import torch
import torch.nn as nn
from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
test/srt/run_suite.py

@@ -20,30 +20,33 @@ suites = {
        TestFile("models/test_generation_models.py", 103),
        TestFile("models/test_qwen_models.py", 82),
        TestFile("models/test_reward_models.py", 83),
        TestFile("test_gptqmodel_dynamic.py", 72),
        TestFile("models/test_gme_qwen_models.py", 45),
        TestFile("test_abort.py", 51),
        TestFile("test_block_int8.py", 22),
        TestFile("test_chunked_prefill.py", 336),
        TestFile("test_custom_allreduce.py", 1),
        TestFile("test_double_sparsity.py", 50),
        TestFile("test_eagle_infer.py", 447),
        TestFile("test_ebnf_constrained.py"),
        TestFile("test_fp8_kernel.py", 2),
        TestFile("test_embedding_openai_server.py", 36),
        TestFile("test_eval_accuracy_mini.py", 63),
        TestFile("test_gguf.py", 78),
        TestFile("test_gptqmodel_dynamic.py", 72),
        TestFile("test_hidden_states.py", 55),
        TestFile("test_int8_kernel.py", 1),
        TestFile("test_input_embeddings.py", 38),
        TestFile("test_json_constrained.py", 98),
        TestFile("test_large_max_new_tokens.py", 41),
        TestFile("test_metrics.py", 32),
        TestFile("test_mla.py", 92),
        TestFile("test_mla_deepseek_v3.py", 221),
        TestFile("test_mla_flashinfer.py", 395),
        TestFile("test_mla_fp8.py", 93),
        TestFile("test_json_constrained.py", 98),
        TestFile("test_large_max_new_tokens.py", 41),
        TestFile("test_metrics.py", 32),
        TestFile("test_no_chunked_prefill.py", 126),
        TestFile("test_no_overlap_scheduler.py", 262),
        TestFile("test_openai_server.py", 124),
        TestFile("test_penalty.py", 41),
        TestFile("test_pytorch_sampling_backend.py", 66),
        TestFile("test_radix_attention.py", 167),
        TestFile("test_reasoning_content.py", 89),
        TestFile("test_regex_constrained.py", 64),
        TestFile("test_release_memory_occupation.py", 44),
        TestFile("test_request_length_validation.py", 31),

@@ -58,7 +61,6 @@ suites = {
        TestFile("test_torchao.py", 70),
        TestFile("test_triton_attention_kernels.py", 4),
        TestFile("test_triton_attention_backend.py", 134),
        TestFile("test_hidden_states.py", 55),
        TestFile("test_update_weights_from_disk.py", 114),
        TestFile("test_update_weights_from_tensor.py", 48),
        TestFile("test_vertex_endpoint.py", 31),

@@ -66,10 +68,6 @@ suites = {
        TestFile("test_vision_llm.py", 18.4),
        TestFile("test_vision_openai_server.py", 344),
        TestFile("test_w8a8_quantization.py", 46),
        TestFile("test_fp8_kernel.py", 2),
        TestFile("test_block_int8.py", 22),
        TestFile("test_int8_kernel.py", 1),
        TestFile("test_reasoning_content.py", 89),
    ],
    "nightly": [
        TestFile("test_nightly_gsm8k_eval.py"),
test/srt/test_bench_one_batch.py

@@ -3,6 +3,7 @@ import unittest
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,

@@ -27,9 +28,13 @@ class TestBenchOneBatch(unittest.TestCase):
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
         )
+        use_vllm_custom_allreduce = get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default="true")

         if is_in_ci():
             write_github_step_summary(
-                f"### test_moe_tp2_bs1\n"
+                f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
                 f"output_throughput : {output_throughput:.2f} token/s\n"
             )

         self.assertGreater(output_throughput, 124)
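Aside, not part of the commit: the {use_vllm_custom_allreduce=} piece in the new summary line is the self-documenting f-string form introduced in Python 3.8, which prints both the expression text and its value:

use_vllm_custom_allreduce = True
print(f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})")
# -> ### test_moe_tp2_bs1 (use_vllm_custom_allreduce=True)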
test/srt/test_eval_accuracy_large_chunked_prefill.py  (deleted, 100644 → 0)

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.705, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.84, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py  (deleted, 100644 → 0)

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--chunked-prefill-size",
                "256",
                "--enable-mixed-chunk",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.705, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.84, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
test/srt/test_eval_accuracy_mini.py  (deleted, 100644 → 0)

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyMini(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
            temperature=0.1,
        )

        metrics = run_eval(args)
        self.assertGreaterEqual(metrics["score"], 0.65)


if __name__ == "__main__":
    unittest.main()
test/srt/test_gptqmodel_dynamic.py

@@ -129,6 +129,7 @@ class TestGPTQModelDynamic(unittest.TestCase):
                 "text": "The capital of France is",
                 "sampling_params": {
                     "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
                 },
             },
         )
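A rough idea of what this test exercises, as a standalone sketch (the server URL, max_new_tokens value, and response handling are assumptions, not taken from the diff): a /generate request with a near-zero temperature makes decoding effectively greedy, so the expected completion stays stable across runs.

import requests

# Assumed local sglang server; the test launches its own server and builds a similar payload.
resp = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 32, "temperature": 0.001},
    },
)
print(resp.json()["text"])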