sglang · Commits · 1b5d56f7

Commit 1b5d56f7 (unverified), authored Sep 01, 2024 by Lianmin Zheng; committed by GitHub on Sep 01, 2024.
[CI] Add more multi-gpu tests (#1280)

parent d134c139

Showing 11 changed files with 271 additions and 128 deletions.
.github/workflows/accuracy-test.yml        +32   -1
.github/workflows/cache-purge.yml           +0  -27
.github/workflows/e2e-test.yml             +42   -2
.github/workflows/moe-test.yml              +0  -45
.github/workflows/unit-test.yml             +5   -5
python/sglang/bench_latency.py             +25  -16
test/srt/test_moe_eval_accuracy_large.py   +73   -0
test/srt/test_moe_serving_latency.py       +45   -0
test/srt/test_moe_serving_throughput.py     +3  -16
test/srt/test_serving_latency.py           +43   -0
test/srt/test_serving_throughput.py         +3  -16
.github/workflows/accuracy-test.yml

@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  accuracy-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
@@ -41,3 +41,34 @@ jobs:
       run: |
         cd test/srt
         python3 test_eval_accuracy_large.py
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+        git clone https://github.com/merrymercy/human-eval.git
+        cd human-eval
+        pip install -e .
+
+    - name: Evaluate Accuracy
+      timeout-minutes: 20
+      run: |
+        cd test/srt
+        python3 test_moe_eval_accuracy_large.py
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
.github/workflows/cache-purge.yml (deleted, 100644 → 0)

name: Weekly Cache Purge

on:
  schedule:
    - cron: '0 0 * * 0'  # Every Sunday at 00:00
  workflow_dispatch:

jobs:
  purge-cache:
    if: github.repository == 'sgl-project/sglang'
    runs-on: self-hosted
    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Purge pip cache
      run: |
        source $HOME/venv/bin/activate
        echo "$HOME/venv/bin" >> $GITHUB_PATH
        pip cache purge

    - name: Update dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
.github/workflows/e2e-test.yml

@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  e2e-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
@@ -41,7 +41,8 @@ jobs:
     - name: Benchmark Serving Latency
       timeout-minutes: 10
       run: |
-        python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+        cd test/srt
+        python3 -m unittest test_serving_latency.TestServingLatency.test_default

     - name: Benchmark Serving Throughput (w/o RadixAttention)
       timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
       run: |
         cd test/srt
         python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+    - name: Benchmark Serving Throughput (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+    - name: Benchmark Serving Latency (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
.github/workflows/moe-test.yml (deleted, 100644 → 0)

name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

    - name: Benchmark MoE Serving Throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default

    - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
.github/workflows/unit-test.yml

@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  unit-test-jobs:
+  run-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
@@ -48,9 +48,9 @@ jobs:
           python3 run_suite.py --suite minimal --range-begin 8
         fi

-  unit-test:
-    needs: unit-test-jobs
+  finish:
+    needs: [run-test]
     runs-on: ubuntu-latest
     steps:
-    - name: Merge step
-      run: echo "This is an empty merge step"
\ No newline at end of file
+    - name: Finish
+      run: echo "This is an empty step to ensure that all jobs are completed."
python/sglang/bench_latency.py

@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"

 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [-10.0312,  -9.5000,   0.8936,  ...,  -4.9414,  -3.2402,  -3.3633],
-        [ -9.1875, -10.2500,   2.7109,  ...,  -4.3359,  -4.0664,  -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211,  3.3379,  ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547,  4.0117,  ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
-The capital of the United States is Washington, D.C.
-<s> The capital of the United Kindom is.
-The capital of the United Kingdom is London.
-The capital of the
-<s> Today is a sunny day and I like go for a walk in the park.
-I'm going to the park
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [-10.0312,  -9.5000,   0.8931,  ...,  -4.9414,  -3.2422,  -3.3633],
+        [ -9.1875, -10.2500,   2.7129,  ...,  -4.3359,  -4.0664,  -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172,  3.3457,  ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156,  4.1445,  ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547,  4.0195,  ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
+The capital of the United States is Washington, D.C.
+
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
+The capital of the United Kingdom is London.
+The capital of the
+
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
+I'm going to the park
 """
@@ -225,12 +233,12 @@ def correctness_test(
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")

     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")

         # Prepare extend inputs
         reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")

     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")

 @torch.inference_mode()
...
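The new debug line relies on the f-string = specifier (Python 3.8+), which prints the expression text together with its value; that is what produces the input_ids=[[1, 450, ...]] line in the reference output above. A minimal illustration:

    # f"{expr=}" renders as "expr=<repr of expr>" (Python 3.8+).
    input_ids = [[1, 450, 7483]]
    print(f"\n{input_ids=}\n")  # -> input_ids=[[1, 450, 7483]]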
test/srt/test_moe_eval_accuracy_large.py (new file, 0 → 100644)

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--tp",
                "2",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
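run_eval takes a plain namespace of options, so the same harness can also be driven by hand against an already-running server. A minimal sketch; the URL and model name below are hypothetical placeholders, not values from this commit:

    # Sketch: run a small MMLU smoke test against a server you launched yourself.
    from types import SimpleNamespace

    from sglang.test.run_eval import run_eval

    args = SimpleNamespace(
        base_url="http://127.0.0.1:30000",  # hypothetical local server URL
        model="my-moe-model",               # hypothetical model name
        eval_name="mmlu",
        num_examples=64,   # small subset for a quick check
        num_threads=32,
    )
    metrics = run_eval(args)
    print(metrics["score"])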
test/srt/test_moe_serving_latency.py (new file, 0 → 100644)

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
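test_default recovers the throughput figure positionally: the third-to-last stdout line, second-to-last whitespace token. That breaks silently if bench_latency gains or loses a trailing line. A more defensive sketch, assuming only that the summary line ends in "<value> token/s" (the format implied by the parsing above):

    import re

    def parse_throughput(output: str) -> float:
        # Find the last "<number> token/s" occurrence anywhere in the output,
        # rather than relying on its exact line position.
        matches = re.findall(r"([\d.]+)\s*token/s", output)
        if not matches:
            raise ValueError("no throughput figure found in benchmark output")
        return float(matches[-1])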
test/srt/test_moe_serving_throughput.py

@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
             other_args.append("--disable-flashinfer")
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
         other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")

         model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1850
+            assert res["output_throughput"] > 1750

     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1950
+            assert res["output_throughput"] > 1850

-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
-
 if __name__ == "__main__":
...
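The throughput assertions in these tests only fire when SGLANG_IS_IN_CI is set, so local runs on arbitrary GPUs do not fail spuriously. A small helper, as a hypothetical refactor rather than anything in this commit, would make the pattern reusable across the four test files:

    import os

    def is_in_ci() -> bool:
        # CI runners have known hardware, so fixed throughput thresholds
        # are only meaningful there.
        return os.getenv("SGLANG_IS_IN_CI", "false") == "true"

    def assert_min_throughput(res: dict, threshold: float) -> None:
        # Enforce a performance floor, but only on CI machines.
        if is_in_ci():
            assert res["output_throughput"] > threshold, res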
test/srt/test_serving_latency.py (new file, 0 → 100644)

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
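As in the MoE variant, communicate() blocks until bench_latency exits, so a hung run is bounded only by the workflow-level timeout-minutes. A hedged sketch of a process-level timeout (hypothetical hardening, not part of this commit):

    import subprocess

    def run_benchmark(command, timeout_s=600):
        # Run the benchmark with a hard timeout; kill the process on expiry
        # and still collect whatever output it produced.
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        try:
            stdout, stderr = process.communicate(timeout=timeout_s)
        except subprocess.TimeoutExpired:
            process.kill()
            stdout, stderr = process.communicate()
        return stdout.decode(), stderr.decode()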
test/srt/test_serving_throughput.py

@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2400
+            assert res["output_throughput"] > 2500

     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
             assert res["output_throughput"] > 2800

     def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2400
+            assert res["output_throughput"] > 2500

-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
-
 if __name__ == "__main__":
...
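The deleted test_all_cases (removed from both throughput test files) looped over disable_radix_cache, disable_flashinfer, and chunked_prefill_size but passed constants to run_test, so every iteration exercised the same configuration. If the sweep were ever revived, a repaired sketch (hypothetical, not part of this commit) would forward the loop variables:

    import itertools

    # Inside the TestServingThroughput class: actually cover all 8 combinations
    # by passing the loop variables through to run_test.
    def test_all_cases(self):
        for radix, flashinfer, chunk in itertools.product(
            [False, True], [False, True], [-1, 2048]
        ):
            self.run_test(
                disable_radix_cache=radix,
                disable_flashinfer=flashinfer,
                chunked_prefill_size=chunk,
            )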