sglang / Commits / 1b5d56f7

Unverified commit 1b5d56f7, authored Sep 01, 2024 by Lianmin Zheng, committed via GitHub on Sep 01, 2024.

[CI] Add more multi-gpu tests (#1280)

Parent: d134c139

Showing 11 changed files with 271 additions and 128 deletions.
.github/workflows/accuracy-test.yml        +32  -1
.github/workflows/cache-purge.yml           +0 -27
.github/workflows/e2e-test.yml             +42  -2
.github/workflows/moe-test.yml              +0 -45
.github/workflows/unit-test.yml             +5  -5
python/sglang/bench_latency.py             +25 -16
test/srt/test_moe_eval_accuracy_large.py   +73  -0
test/srt/test_moe_serving_latency.py       +45  -0
test/srt/test_moe_serving_throughput.py     +3 -16
test/srt/test_serving_latency.py           +43  -0
test/srt/test_serving_throughput.py         +3 -16
.github/workflows/accuracy-test.yml

@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  accuracy-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
@@ -41,3 +41,34 @@ jobs:
       run: |
         cd test/srt
         python3 test_eval_accuracy_large.py
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+        git clone https://github.com/merrymercy/human-eval.git
+        cd human-eval
+        pip install -e .
+
+    - name: Evaluate Accuracy
+      timeout-minutes: 20
+      run: |
+        cd test/srt
+        python3 test_moe_eval_accuracy_large.py
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
.github/workflows/cache-purge.yml (deleted, 100644 → 0)

name: Weekly Cache Purge

on:
  schedule:
    - cron: '0 0 * * 0'  # Every Sunday at 00:00
  workflow_dispatch:

jobs:
  purge-cache:
    if: github.repository == 'sgl-project/sglang'
    runs-on: self-hosted
    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Purge pip cache
      run: |
        source $HOME/venv/bin/activate
        echo "$HOME/venv/bin" >> $GITHUB_PATH
        pip cache purge

    - name: Update dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
.github/workflows/e2e-test.yml

@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  e2e-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
@@ -41,7 +41,8 @@ jobs:
     - name: Benchmark Serving Latency
       timeout-minutes: 10
       run: |
-        python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+        cd test/srt
+        python3 -m unittest test_serving_latency.TestServingLatency.test_default

     - name: Benchmark Serving Throughput (w/o RadixAttention)
       timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
       run: |
         cd test/srt
         python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+    - name: Benchmark Serving Throughput (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+    - name: Benchmark Serving Latency (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
.github/workflows/moe-test.yml (deleted, 100644 → 0)

name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

    - name: Benchmark MoE Serving Throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default

    - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
.github/workflows/unit-test.yml

@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  unit-test-jobs:
+  run-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
@@ -48,9 +48,9 @@ jobs:
           python3 run_suite.py --suite minimal --range-begin 8
         fi

-  unit-test:
-    needs: unit-test-jobs
+  finish:
+    needs: [run-test]
     runs-on: ubuntu-latest
     steps:
-    - name: Merge step
-      run: echo "This is an empty merge step"
\ No newline at end of file
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
python/sglang/bench_latency.py

@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"

 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [ -9.1875, -10.2500,   2.7109,  ...,  -4.3359,  -4.0664,  -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211,  3.3379,  ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547,  4.0117,  ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
 The capital of the United States is Washington, D.C.
-<s> The capital of the United Kindom is.
+
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
 The capital of the United Kingdom is London.
 The capital of the
-<s> Today is a sunny day and I like go for a walk in the park.
+
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
 I'm going to the park
 """

@@ -225,12 +233,12 @@ def correctness_test(
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")

     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")

     # Prepare extend inputs
     reqs = prepare_extend_inputs_for_correctness_test(

@@ -239,7 +247,7 @@ def correctness_test(
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")

     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]

@@ -250,7 +258,8 @@ def correctness_test(
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")


 @torch.inference_mode()
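The rank_print changes above add explicit labels and newlines so the correctness-test output matches the reference block in the updated docstring. rank_print itself is defined elsewhere in bench_latency.py and does not appear in this diff; a minimal sketch of the usual rank-gated print pattern it presumably follows, where tp_rank is an assumed per-process variable rather than code from this change:

    # Sketch: emit output only on tensor-parallel rank 0 so a --tp 2 run
    # does not print every line once per GPU. `tp_rank` is assumed to be
    # assigned per process by the launcher; it is not taken from this diff.
    tp_rank = 0

    def rank_print(*args, **kwargs):
        if tp_rank == 0:
            print(*args, **kwargs)

    rank_print("prefill logits (final):", "...")  # printed once, on rank 0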
test/srt/test_moe_eval_accuracy_large.py (new file, 0 → 100644)

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--tp",
                "2",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
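For local debugging, the same launch, evaluate, and tear-down cycle can be driven outside unittest with exactly the helpers this file imports; a minimal sketch, reusing the constants and the mmlu arguments from test_mmlu above (nothing is assumed beyond running it as a standalone script):

    from types import SimpleNamespace

    from sglang.srt.utils import kill_child_process
    from sglang.test.run_eval import run_eval
    from sglang.test.test_utils import (
        DEFAULT_MOE_MODEL_NAME_FOR_TEST,
        DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        DEFAULT_URL_FOR_TEST,
        popen_launch_server,
    )

    # Launch a TP=2 server, run one eval, then clean up the process tree.
    process = popen_launch_server(
        DEFAULT_MOE_MODEL_NAME_FOR_TEST,
        DEFAULT_URL_FOR_TEST,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=["--tp", "2"],
    )
    try:
        metrics = run_eval(
            SimpleNamespace(
                base_url=DEFAULT_URL_FOR_TEST,
                model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
                eval_name="mmlu",
                num_examples=3000,
                num_threads=1024,
            )
        )
        print(metrics["score"])
    finally:
        kill_child_process(process.pid)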
test/srt/test_moe_serving_latency.py (new file, 0 → 100644)

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
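The latency check scrapes the benchmark's human-readable summary: output.split("\n")[-3] picks the third line from the end of stdout, and split(" ")[-2] takes the second-to-last whitespace token on it, which is expected to be the decode throughput that must exceed 125 token/s in CI. The indexing is easiest to see on a sample; the sample text below is an assumption about the shape of bench_latency's output, not taken from this diff:

    # Assumed shape of the tail of bench_latency's stdout; exact wording may differ.
    output = (
        "Benchmark ...\n"
        "Total. latency: 1.234 s, throughput: 110.21 token/s\n"
        "Decode. median latency: 0.006 s, median throughput: 160.25 token/s\n"
        "\n"
    )

    lastline = output.split("\n")[-3]        # -> the "Decode. ..." summary line
    value = float(lastline.split(" ")[-2])   # -> 160.25, the token/s figure
    assert value > 125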
test/srt/test_moe_serving_throughput.py

@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
             other_args.append("--disable-flashinfer")
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
         other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")

         model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST

@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,

@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1750
+            assert res["output_throughput"] > 1850

     def test_default_without_radix_cache(self):
         res = self.run_test(

@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1850
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 1950


 if __name__ == "__main__":
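Both throughput tests repeat the same guard, asserting thresholds only when SGLANG_IS_IN_CI is set so local runs on slower GPUs do not fail spuriously. A small hypothetical helper (not part of this change) that would centralize the pattern:

    import os

    def assert_in_ci(condition: bool, message: str = "") -> None:
        # Only enforce performance thresholds on the CI runners, where the
        # hardware (and therefore the expected throughput) is known.
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert condition, message

    # Usage, mirroring the check above:
    # assert_in_ci(res["output_throughput"] > 1950, f"{res}")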
test/srt/test_serving_latency.py (new file, 0 → 100644)

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
test/srt/test_serving_throughput.py

@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,

@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
+            assert res["output_throughput"] > 2400

     def test_default_without_radix_cache(self):
         res = self.run_test(

@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
             assert res["output_throughput"] > 2800

     def test_default_without_chunked_prefill(self):

@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 2400


 if __name__ == "__main__":