Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
aa957102
Unverified
Commit
aa957102
authored
Mar 10, 2025
by
Lianmin Zheng
Committed by
GitHub
Mar 10, 2025
Browse files
Simplify tests & Fix trtllm custom allreduce registration (#4252)
parent
007f8b3d
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
30 additions
and
211 deletions
+30
-211
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+1
-1
python/sglang/srt/_custom_ops.py
python/sglang/srt/_custom_ops.py
+4
-2
python/sglang/srt/layers/attention/flashinfer_backend.py
python/sglang/srt/layers/attention/flashinfer_backend.py
+1
-1
python/sglang/srt/model_loader/loader.py
python/sglang/srt/model_loader/loader.py
+2
-1
sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
+3
-2
sgl-kernel/csrc/torch_extension.cc
sgl-kernel/csrc/torch_extension.cc
+2
-5
sgl-kernel/tests/test_rotary_embedding.py
sgl-kernel/tests/test_rotary_embedding.py
+0
-2
test/srt/run_suite.py
test/srt/run_suite.py
+10
-12
test/srt/test_bench_one_batch.py
test/srt/test_bench_one_batch.py
+6
-1
test/srt/test_eval_accuracy_large_chunked_prefill.py
test/srt/test_eval_accuracy_large_chunked_prefill.py
+0
-68
test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
+0
-74
test/srt/test_eval_accuracy_mini.py
test/srt/test_eval_accuracy_mini.py
+0
-42
test/srt/test_gptqmodel_dynamic.py
test/srt/test_gptqmodel_dynamic.py
+1
-0
No files found.
.github/workflows/pr-test.yml
View file @
aa957102
...
@@ -266,7 +266,7 @@ jobs:
...
@@ -266,7 +266,7 @@ jobs:
cd test/srt
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
#
USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-
name
:
Benchmark single latency + torch.compile (TP=2)
-
name
:
Benchmark single latency + torch.compile (TP=2)
timeout-minutes
:
10
timeout-minutes
:
10
...
...
python/sglang/srt/_custom_ops.py
View file @
aa957102
...
@@ -6,10 +6,12 @@ from typing import List, Tuple
...
@@ -6,10 +6,12 @@ from typing import List, Tuple
import
torch
import
torch
import
torch.library
import
torch.library
from
sglang.srt.utils
import
is_hip
,
is_hpu
from
sglang.srt.utils
import
get_bool_env_var
,
is_hip
,
is_hpu
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
use_vllm_custom_allreduce
=
os
.
environ
.
get
(
"USE_VLLM_CUSTOM_ALLREDUCE"
,
default
=
True
)
use_vllm_custom_allreduce
=
get_bool_env_var
(
"USE_VLLM_CUSTOM_ALLREDUCE"
,
default
=
"true"
)
if
not
is_hpu
():
if
not
is_hpu
():
# ROCm does not use vllm custom allreduce
# ROCm does not use vllm custom allreduce
...
...
python/sglang/srt/layers/attention/flashinfer_backend.py
View file @
aa957102
...
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
...
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
from
sglang.srt.layers.dp_attention
import
get_attention_tp_size
from
sglang.srt.layers.dp_attention
import
get_attention_tp_size
from
sglang.srt.model_executor.forward_batch_info
import
ForwardBatch
,
ForwardMode
from
sglang.srt.model_executor.forward_batch_info
import
ForwardBatch
,
ForwardMode
from
sglang.srt.speculative.eagle_utils
import
EagleDraftInput
,
EagleVerifyInput
from
sglang.srt.speculative.eagle_utils
import
EagleDraftInput
,
EagleVerifyInput
from
sglang.srt.utils
import
is_flashinfer_available
from
sglang.srt.utils
import
get_bool_env_var
,
is_flashinfer_available
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
sglang.srt.layers.radix_attention
import
RadixAttention
from
sglang.srt.layers.radix_attention
import
RadixAttention
...
...
python/sglang/srt/model_loader/loader.py
View file @
aa957102
...
@@ -48,6 +48,7 @@ from sglang.srt.model_loader.weight_utils import (
...
@@ -48,6 +48,7 @@ from sglang.srt.model_loader.weight_utils import (
safetensors_weights_iterator
,
safetensors_weights_iterator
,
)
)
from
sglang.srt.utils
import
(
from
sglang.srt.utils
import
(
get_bool_env_var
,
get_device_capability
,
get_device_capability
,
is_pin_memory_available
,
is_pin_memory_available
,
set_weight_attrs
,
set_weight_attrs
,
...
@@ -197,7 +198,7 @@ class DefaultModelLoader(BaseModelLoader):
...
@@ -197,7 +198,7 @@ class DefaultModelLoader(BaseModelLoader):
Returns the path to the downloaded model, or None if the model is not
Returns the path to the downloaded model, or None if the model is not
downloaded from ModelScope."""
downloaded from ModelScope."""
if
os
.
environ
.
get
(
"SGLANG_USE_MODELSCOPE"
,
None
)
==
"True"
:
if
get_bool_env_var
(
"SGLANG_USE_MODELSCOPE"
)
:
# download model from ModelScope hub,
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
# lazy import so that modelscope is not required for normal use.
# pylint: disable=C.
# pylint: disable=C.
...
...
sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
View file @
aa957102
...
@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
...
@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
check_device_dtype
(
out_dtype
,
inputs
);
check_device_dtype
(
out_dtype
,
inputs
);
check_device_dtype
(
out_dtype
,
weights
);
check_device_dtype
(
out_dtype
,
weights
);
check_device_dtype
(
out_dtype
,
outputs
);
check_device_dtype
(
out_dtype
,
outputs
);
cudaDataType_t
cuda_data_type
=
(
out_dtype
==
torch
::
kHalf
?
CUDA_R_16F
:
CUDA_R_16BF
);
// Weights should be transposed to (n, k) of column major
// Weights should be transposed to (n, k) of column major
std
::
vector
<
cublasOperation_t
>
transa_array
(
group_count
,
CUBLAS_OP_T
);
std
::
vector
<
cublasOperation_t
>
transa_array
(
group_count
,
CUBLAS_OP_T
);
...
@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
...
@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
std
::
vector
<
void
*>
b_array
=
get_tensor_ptrs
(
inputs
);
std
::
vector
<
void
*>
b_array
=
get_tensor_ptrs
(
inputs
);
std
::
vector
<
void
*>
c_array
=
get_tensor_ptrs
(
outputs
);
std
::
vector
<
void
*>
c_array
=
get_tensor_ptrs
(
outputs
);
auto
handle
=
reinterpret_cast
<
cublasHandle_t
>
(
cublas_handle
);
auto
stream
=
reinterpret_cast
<
cudaStream_t
>
(
cuda_stream
);
auto
stream
=
reinterpret_cast
<
cudaStream_t
>
(
cuda_stream
);
// Should allocate tensors for storage of pointers
// Should allocate tensors for storage of pointers
...
@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
...
@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
torch
::
Tensor
d_c
=
create_ptr_pointer
(
c_array
,
stream
);
torch
::
Tensor
d_c
=
create_ptr_pointer
(
c_array
,
stream
);
#if defined CUDA_VERSION && CUDA_VERSION >= 12050
#if defined CUDA_VERSION && CUDA_VERSION >= 12050
auto
handle
=
reinterpret_cast
<
cublasHandle_t
>
(
cublas_handle
);
cudaDataType_t
cuda_data_type
=
(
out_dtype
==
torch
::
kHalf
?
CUDA_R_16F
:
CUDA_R_16BF
);
auto
status
=
cublasGemmGroupedBatchedEx
(
auto
status
=
cublasGemmGroupedBatchedEx
(
handle
,
handle
,
transa_array
.
data
(),
transa_array
.
data
(),
...
...
sgl-kernel/csrc/torch_extension.cc
View file @
aa957102
...
@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
...
@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
m
.
def
(
"all_reduce(int fa, Tensor inp, Tensor! out) -> ()"
);
m
.
def
(
"all_reduce(int fa, Tensor inp, Tensor! out) -> ()"
);
m
.
impl
(
"all_reduce"
,
torch
::
kCUDA
,
&
all_reduce
);
m
.
impl
(
"all_reduce"
,
torch
::
kCUDA
,
&
all_reduce
);
m
.
def
(
"get_graph_buffer_ipc_meta(int fa) -> (int[], int[])"
);
m
.
def
(
"get_graph_buffer_ipc_meta"
,
&
get_graph_buffer_ipc_meta
);
m
.
impl
(
"get_graph_buffer_ipc_meta"
,
torch
::
kCUDA
,
&
get_graph_buffer_ipc_meta
);
m
.
def
(
"register_graph_buffers"
,
&
register_graph_buffers
);
m
.
def
(
"register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()"
);
m
.
impl
(
"register_graph_buffers"
,
torch
::
kCUDA
,
&
register_graph_buffers
);
/*
/*
* From csrc/attention
* From csrc/attention
...
...
sgl-kernel/tests/test_rotary_embedding.py
View file @
aa957102
import
math
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Union
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
,
Union
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn
as
nn
from
sgl_kernel
import
apply_rope_with_cos_sin_cache_inplace
from
sgl_kernel
import
apply_rope_with_cos_sin_cache_inplace
...
...
test/srt/run_suite.py
View file @
aa957102
...
@@ -20,30 +20,33 @@ suites = {
...
@@ -20,30 +20,33 @@ suites = {
TestFile
(
"models/test_generation_models.py"
,
103
),
TestFile
(
"models/test_generation_models.py"
,
103
),
TestFile
(
"models/test_qwen_models.py"
,
82
),
TestFile
(
"models/test_qwen_models.py"
,
82
),
TestFile
(
"models/test_reward_models.py"
,
83
),
TestFile
(
"models/test_reward_models.py"
,
83
),
TestFile
(
"test_gptqmodel_dynamic.py"
,
72
),
TestFile
(
"models/test_gme_qwen_models.py"
,
45
),
TestFile
(
"models/test_gme_qwen_models.py"
,
45
),
TestFile
(
"test_abort.py"
,
51
),
TestFile
(
"test_abort.py"
,
51
),
TestFile
(
"test_block_int8.py"
,
22
),
TestFile
(
"test_chunked_prefill.py"
,
336
),
TestFile
(
"test_chunked_prefill.py"
,
336
),
TestFile
(
"test_custom_allreduce.py"
,
1
),
TestFile
(
"test_double_sparsity.py"
,
50
),
TestFile
(
"test_eagle_infer.py"
,
447
),
TestFile
(
"test_eagle_infer.py"
,
447
),
TestFile
(
"test_ebnf_constrained.py"
),
TestFile
(
"test_fp8_kernel.py"
,
2
),
TestFile
(
"test_embedding_openai_server.py"
,
36
),
TestFile
(
"test_embedding_openai_server.py"
,
36
),
TestFile
(
"test_eval_accuracy_mini.py"
,
63
),
TestFile
(
"test_gguf.py"
,
78
),
TestFile
(
"test_gguf.py"
,
78
),
TestFile
(
"test_gptqmodel_dynamic.py"
,
72
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_int8_kernel.py"
,
1
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_input_embeddings.py"
,
38
),
TestFile
(
"test_json_constrained.py"
,
98
),
TestFile
(
"test_large_max_new_tokens.py"
,
41
),
TestFile
(
"test_metrics.py"
,
32
),
TestFile
(
"test_mla.py"
,
92
),
TestFile
(
"test_mla.py"
,
92
),
TestFile
(
"test_mla_deepseek_v3.py"
,
221
),
TestFile
(
"test_mla_deepseek_v3.py"
,
221
),
TestFile
(
"test_mla_flashinfer.py"
,
395
),
TestFile
(
"test_mla_flashinfer.py"
,
395
),
TestFile
(
"test_mla_fp8.py"
,
93
),
TestFile
(
"test_mla_fp8.py"
,
93
),
TestFile
(
"test_json_constrained.py"
,
98
),
TestFile
(
"test_large_max_new_tokens.py"
,
41
),
TestFile
(
"test_metrics.py"
,
32
),
TestFile
(
"test_no_chunked_prefill.py"
,
126
),
TestFile
(
"test_no_chunked_prefill.py"
,
126
),
TestFile
(
"test_no_overlap_scheduler.py"
,
262
),
TestFile
(
"test_no_overlap_scheduler.py"
,
262
),
TestFile
(
"test_openai_server.py"
,
124
),
TestFile
(
"test_openai_server.py"
,
124
),
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_penalty.py"
,
41
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
TestFile
(
"test_radix_attention.py"
,
167
),
TestFile
(
"test_radix_attention.py"
,
167
),
TestFile
(
"test_reasoning_content.py"
,
89
),
TestFile
(
"test_regex_constrained.py"
,
64
),
TestFile
(
"test_regex_constrained.py"
,
64
),
TestFile
(
"test_release_memory_occupation.py"
,
44
),
TestFile
(
"test_release_memory_occupation.py"
,
44
),
TestFile
(
"test_request_length_validation.py"
,
31
),
TestFile
(
"test_request_length_validation.py"
,
31
),
...
@@ -58,7 +61,6 @@ suites = {
...
@@ -58,7 +61,6 @@ suites = {
TestFile
(
"test_torchao.py"
,
70
),
TestFile
(
"test_torchao.py"
,
70
),
TestFile
(
"test_triton_attention_kernels.py"
,
4
),
TestFile
(
"test_triton_attention_kernels.py"
,
4
),
TestFile
(
"test_triton_attention_backend.py"
,
134
),
TestFile
(
"test_triton_attention_backend.py"
,
134
),
TestFile
(
"test_hidden_states.py"
,
55
),
TestFile
(
"test_update_weights_from_disk.py"
,
114
),
TestFile
(
"test_update_weights_from_disk.py"
,
114
),
TestFile
(
"test_update_weights_from_tensor.py"
,
48
),
TestFile
(
"test_update_weights_from_tensor.py"
,
48
),
TestFile
(
"test_vertex_endpoint.py"
,
31
),
TestFile
(
"test_vertex_endpoint.py"
,
31
),
...
@@ -66,10 +68,6 @@ suites = {
...
@@ -66,10 +68,6 @@ suites = {
TestFile
(
"test_vision_llm.py"
,
18.4
),
TestFile
(
"test_vision_llm.py"
,
18.4
),
TestFile
(
"test_vision_openai_server.py"
,
344
),
TestFile
(
"test_vision_openai_server.py"
,
344
),
TestFile
(
"test_w8a8_quantization.py"
,
46
),
TestFile
(
"test_w8a8_quantization.py"
,
46
),
TestFile
(
"test_fp8_kernel.py"
,
2
),
TestFile
(
"test_block_int8.py"
,
22
),
TestFile
(
"test_int8_kernel.py"
,
1
),
TestFile
(
"test_reasoning_content.py"
,
89
),
],
],
"nightly"
:
[
"nightly"
:
[
TestFile
(
"test_nightly_gsm8k_eval.py"
),
TestFile
(
"test_nightly_gsm8k_eval.py"
),
...
...
test/srt/test_bench_one_batch.py
View file @
aa957102
...
@@ -3,6 +3,7 @@ import unittest
...
@@ -3,6 +3,7 @@ import unittest
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
get_bool_env_var
,
is_in_ci
,
is_in_ci
,
run_bench_one_batch
,
run_bench_one_batch
,
write_github_step_summary
,
write_github_step_summary
,
...
@@ -27,9 +28,13 @@ class TestBenchOneBatch(unittest.TestCase):
...
@@ -27,9 +28,13 @@ class TestBenchOneBatch(unittest.TestCase):
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
[
"--tp"
,
"2"
,
"--cuda-graph-max-bs"
,
"2"
]
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
[
"--tp"
,
"2"
,
"--cuda-graph-max-bs"
,
"2"
]
)
)
use_vllm_custom_allreduce
=
get_bool_env_var
(
"USE_VLLM_CUSTOM_ALLREDUCE"
,
default
=
"true"
)
if
is_in_ci
():
if
is_in_ci
():
write_github_step_summary
(
write_github_step_summary
(
f
"### test_moe_tp2_bs1
\n
"
f
"### test_moe_tp2_bs1
(
{
use_vllm_custom_allreduce
=
}
)
\n
"
f
"output_throughput :
{
output_throughput
:.
2
f
}
token/s
\n
"
f
"output_throughput :
{
output_throughput
:.
2
f
}
token/s
\n
"
)
)
self
.
assertGreater
(
output_throughput
,
124
)
self
.
assertGreater
(
output_throughput
,
124
)
...
...
test/srt/test_eval_accuracy_large_chunked_prefill.py
deleted
100644 → 0
View file @
007f8b3d
import
unittest
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class
TestEvalAccuracyLargeChunkedPrefill
(
unittest
.
TestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
model
=
DEFAULT_MODEL_NAME_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--log-level-http"
,
"warning"
,
"--chunked-prefill-size"
,
"256"
],
)
@
classmethod
def
tearDownClass
(
cls
):
kill_process_tree
(
cls
.
process
.
pid
)
def
test_mmlu
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mmlu"
,
num_examples
=
3000
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
assert
metrics
[
"score"
]
>=
0.705
,
f
"
{
metrics
}
"
def
test_human_eval
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"humaneval"
,
num_examples
=
None
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
assert
metrics
[
"score"
]
>=
0.64
,
f
"
{
metrics
}
"
def
test_mgsm_en
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mgsm_en"
,
num_examples
=
None
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
assert
metrics
[
"score"
]
>=
0.84
,
f
"
{
metrics
}
"
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
deleted
100644 → 0
View file @
007f8b3d
import
unittest
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class
TestEvalAccuracyLargeChunkedPrefill
(
unittest
.
TestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
model
=
DEFAULT_MODEL_NAME_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--log-level-http"
,
"warning"
,
"--chunked-prefill-size"
,
"256"
,
"--enable-mixed-chunk"
,
],
)
@
classmethod
def
tearDownClass
(
cls
):
kill_process_tree
(
cls
.
process
.
pid
)
def
test_mmlu
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mmlu"
,
num_examples
=
3000
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
assert
metrics
[
"score"
]
>=
0.705
,
f
"
{
metrics
}
"
def
test_human_eval
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"humaneval"
,
num_examples
=
None
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
assert
metrics
[
"score"
]
>=
0.64
,
f
"
{
metrics
}
"
def
test_mgsm_en
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mgsm_en"
,
num_examples
=
None
,
num_threads
=
1024
,
)
metrics
=
run_eval
(
args
)
assert
metrics
[
"score"
]
>=
0.84
,
f
"
{
metrics
}
"
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_eval_accuracy_mini.py
deleted
100644 → 0
View file @
007f8b3d
import
unittest
from
types
import
SimpleNamespace
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.run_eval
import
run_eval
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class
TestEvalAccuracyMini
(
unittest
.
TestCase
):
@
classmethod
def
setUpClass
(
cls
):
cls
.
model
=
DEFAULT_MODEL_NAME_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
process
=
popen_launch_server
(
cls
.
model
,
cls
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
)
@
classmethod
def
tearDownClass
(
cls
):
kill_process_tree
(
cls
.
process
.
pid
)
def
test_mmlu
(
self
):
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
model
=
self
.
model
,
eval_name
=
"mmlu"
,
num_examples
=
64
,
num_threads
=
32
,
temperature
=
0.1
,
)
metrics
=
run_eval
(
args
)
self
.
assertGreaterEqual
(
metrics
[
"score"
],
0.65
)
if
__name__
==
"__main__"
:
unittest
.
main
()
test/srt/test_gptqmodel_dynamic.py
View file @
aa957102
...
@@ -129,6 +129,7 @@ class TestGPTQModelDynamic(unittest.TestCase):
...
@@ -129,6 +129,7 @@ class TestGPTQModelDynamic(unittest.TestCase):
"text"
:
"The capital of France is"
,
"text"
:
"The capital of France is"
,
"sampling_params"
:
{
"sampling_params"
:
{
"max_new_tokens"
:
max_new_tokens
,
"max_new_tokens"
:
max_new_tokens
,
"temperature"
:
0.001
,
},
},
},
},
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment