sglang / Commits / f7fb68d2

Unverified commit f7fb68d2, authored Aug 13, 2024 by Yineng Zhang, committed via GitHub on Aug 13, 2024

ci: add moe test (#1053)

Parent: 396a13e6

Showing 16 changed files with 195 additions and 44 deletions
.github/workflows/moe-test.yml                          +42    -0
python/sglang/test/test_utils.py                         +5    -1
test/srt/test_chunked_prefill.py                         +2    -3
test/srt/test_embedding_openai_server.py                 +2    -3
test/srt/test_eval_accuracy_large.py                     +5    -5
test/srt/test_eval_accuracy_large_chunked_prefill.py     +5    -5
test/srt/test_eval_accuracy_mini.py                      +2    -3
test/srt/test_large_max_new_tokens.py                    +2    -3
test/srt/test_moe_serving_throughput.py                +112    -0
test/srt/test_openai_server.py                           +2    -3
test/srt/test_serving_throughput.py                      +6    -3
test/srt/test_skip_tokenizer_init.py                     +2    -3
test/srt/test_srt_endpoint.py                            +2    -3
test/srt/test_torch_compile.py                           +2    -3
test/srt/test_triton_attn_backend.py                     +2    -3
test/srt/test_vision_openai_server.py                    +2    -3
.github/workflows/moe-test.yml (new file, mode 100644)

name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: accuracy
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

      - name: Benchmark MOE Serving Throughput
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
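
Note: the workflow above drives the two MoE cases through the unittest CLI. A minimal sketch of the same invocation from Python (run from test/srt with the dependencies installed as in the "Install dependencies" step; the module and test names come from this commit, everything else is standard unittest):

    import unittest

    # Load exactly the two cases the CI job runs.
    suite = unittest.defaultTestLoader.loadTestsFromNames([
        "test_moe_serving_throughput.TestServingThroughput.test_default",
        "test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache",
    ])
    unittest.TextTestRunner(verbosity=2).run(suite)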
python/sglang/test/test_utils.py

@@ -21,7 +21,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback

 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
+DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
+DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
+DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"


 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
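
Note: each suite now gets its own localhost port (MoE 6157, accuracy 7157, unit 8157, e2e 9157), presumably so servers launched by different CI jobs on the same runner do not collide. An illustrative check, not part of the commit, assuming an sglang dev install:

    from urllib.parse import urlparse

    from sglang.test.test_utils import (
        DEFAULT_URL_FOR_ACCURACY_TEST,
        DEFAULT_URL_FOR_E2E_TEST,
        DEFAULT_URL_FOR_MOE_TEST,
        DEFAULT_URL_FOR_UNIT_TEST,
    )

    # One distinct port per test category.
    ports = {
        "moe": urlparse(DEFAULT_URL_FOR_MOE_TEST).port,            # 6157
        "accuracy": urlparse(DEFAULT_URL_FOR_ACCURACY_TEST).port,  # 7157
        "unit": urlparse(DEFAULT_URL_FOR_UNIT_TEST).port,          # 8157
        "e2e": urlparse(DEFAULT_URL_FOR_E2E_TEST).port,            # 9157
    }
    assert len(set(ports.values())) == len(ports)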
test/srt/test_chunked_prefill.py

@@ -5,20 +5,19 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestChunkedPrefill(unittest.TestCase):
     def run_mmlu(self, disable_radix_cache):
         other_args = ["--chunked-prefill-size", "32"]
         if disable_radix_cache:
             other_args += ["--disable-radix-cache"]

         model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = DEFAULT_URL_FOR_TEST
+        base_url = DEFAULT_URL_FOR_UNIT_TEST
         process = popen_launch_server(
             model,
             base_url,
test/srt/test_embedding_openai_server.py

@@ -4,15 +4,14 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server


 class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = "intfloat/e5-mistral-7b-instruct"
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
test/srt/test_eval_accuracy_large.py

@@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_ACCURACY_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestEvalAccuracyLarge(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:7157"
+        cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,

@@ -49,7 +49,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65, f"{metrics}"
+        assert metrics["score"] >= 0.64, f"{metrics}"

     def test_mgsm_en(self):
         args = SimpleNamespace(

@@ -61,7 +61,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"
+        assert metrics["score"] >= 0.84, f"{metrics}"


 if __name__ == "__main__":
test/srt/test_eval_accuracy_large_chunked_prefill.py

@@ -5,17 +5,17 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_ACCURACY_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:7157"
+        cls.base_url = DEFAULT_URL_FOR_ACCURACY_TEST
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,

@@ -49,7 +49,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65, f"{metrics}"
+        assert metrics["score"] >= 0.64, f"{metrics}"

     def test_mgsm_en(self):
         args = SimpleNamespace(

@@ -61,7 +61,7 @@ class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"
+        assert metrics["score"] >= 0.84, f"{metrics}"


 if __name__ == "__main__":
test/srt/test_eval_accuracy_mini.py

@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestEvalAccuracyMini(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)

     @classmethod
test/srt/test_large_max_new_tokens.py

@@ -10,17 +10,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,
test/srt/test_moe_serving_throughput.py (new file, mode 100644)

import os
import unittest
from types import SimpleNamespace

from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_MOE_TEST,
    popen_launch_server,
)


class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if disable_flashinfer:
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])
        other_args.append("--enable-p2p-check")

        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_MOE_TEST
        process = popen_launch_server(
            model, base_url, timeout=300, other_args=other_args
        )

        # Run benchmark
        num_prompts = 400
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts
        return res

    def test_default(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            assert res["output_throughput"] > 950

    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            assert res["output_throughput"] > 950

    def test_default_with_chunked_prefill(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=8192,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            print(res["output_throughput"])

    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )


if __name__ == "__main__":
    unittest.main()
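
Note: a minimal sketch of running the new MoE throughput test outside CI. It assumes the dev install from the workflow above and at least two GPUs (the test passes --tensor-parallel-size 2); the throughput assertion only fires when SGLANG_IS_IN_CI=true.

    import os
    import subprocess

    # Mirror the CI environment so the A100 throughput assertion is enabled.
    env = dict(os.environ, SGLANG_IS_IN_CI="true")
    subprocess.run(
        [
            "python3", "-m", "unittest",
            "test_moe_serving_throughput.TestServingThroughput.test_default",
        ],
        cwd="test/srt",
        env=env,
        check=True,
    )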
test/srt/test_openai_server.py

@@ -8,17 +8,16 @@ from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
test/srt/test_serving_throughput.py

@@ -5,11 +5,14 @@ from types import SimpleNamespace
 from sglang.bench_serving import run_benchmark
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_E2E_TEST,
+    popen_launch_server,
+)


 class TestServingThroughput(unittest.TestCase):
     def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
         # Launch the server
         other_args = []

@@ -20,7 +23,7 @@ class TestServingThroughput(unittest.TestCase):
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])

         model = DEFAULT_MODEL_NAME_FOR_TEST
-        base_url = "http://127.0.0.1:9157"
+        base_url = DEFAULT_URL_FOR_E2E_TEST
         process = popen_launch_server(model, base_url, timeout=300, other_args=other_args)
test/srt/test_skip_tokenizer_init.py

@@ -6,17 +6,16 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestSkipTokenizerInit(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"]
         )
test/srt/test_srt_endpoint.py

@@ -6,17 +6,16 @@ import requests
 from sglang.srt.utils import kill_child_process
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestSRTEndpoint(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)

     @classmethod
test/srt/test_torch_compile.py

@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestTorchCompile(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
         )
test/srt/test_triton_attn_backend.py

@@ -5,17 +5,16 @@ from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_URL_FOR_TEST,
+    DEFAULT_URL_FOR_UNIT_TEST,
     popen_launch_server,
 )


 class TestTritonAttnBackend(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"]
         )
test/srt/test_vision_openai_server.py

@@ -5,15 +5,14 @@ import openai
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_UNIT_TEST, popen_launch_server


 class TestOpenAIVisionServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_UNIT_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,