Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
68be2f6d
"tests/python/vscode:/vscode.git/clone" did not exist on "1d86b796a1575117b1c3f0c69569cf154120c437"
Unverified
Commit
68be2f6d
authored
Sep 12, 2024
by
Lianmin Zheng
Committed by
GitHub
Sep 12, 2024
Browse files
[CI] Include triton backend and online serving benchmark into CI (#1408)
parent
b912de11
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
270 additions
and
307 deletions
+270
-307
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+42
-16
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+46
-0
test/srt/test_bench_latency.py
test/srt/test_bench_latency.py
+83
-0
test/srt/test_bench_serving.py
test/srt/test_bench_serving.py
+99
-0
test/srt/test_moe_serving_latency.py
test/srt/test_moe_serving_latency.py
+0
-45
test/srt/test_moe_serving_throughput.py
test/srt/test_moe_serving_throughput.py
+0
-92
test/srt/test_serving_latency.py
test/srt/test_serving_latency.py
+0
-43
test/srt/test_serving_throughput.py
test/srt/test_serving_throughput.py
+0
-111
No files found.
.github/workflows/pr-test.yml
View file @
68be2f6d
...
@@ -75,7 +75,7 @@ jobs:
...
@@ -75,7 +75,7 @@ jobs:
cd test/srt
cd test/srt
python3 run_suite.py --suite minimal --range-begin 8
python3 run_suite.py --suite minimal --range-begin 8
performance-test-1-gpu
:
performance-test-1-gpu
-part-1
:
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on
:
1-gpu-runner
runs-on
:
1-gpu-runner
steps
:
steps
:
...
@@ -88,29 +88,54 @@ jobs:
...
@@ -88,29 +88,54 @@ jobs:
pip install -e "python[all]"
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
name
:
Benchmark
Serv
in
g
Throughput
-
name
:
Benchmark
Offl
in
e
Throughput
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_serving
_throughput.TestServingT
hroughput
.test
_default
python3 -m unittest test_
bench_
serving
.TestBenchServing.test_offline_t
hroughput_default
-
name
:
Benchmark
Serving Latency
-
name
:
Benchmark
Offline Throughput (w/o RadixAttention)
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_serving
_latency.TestServingLatency.test_default
python3 -m unittest test_
bench_
serving
.TestBenchServing.test_offline_throughput_without_radix_cache
-
name
:
Benchmark
Serv
in
g
Throughput (w/o
RadixAttention
)
-
name
:
Benchmark
Offl
in
e
Throughput (w/o
ChunkedPrefill
)
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_serving
_throughput.TestServingThroughput.test_default_without_radix_cache
python3 -m unittest test_
bench_
serving
.TestBenchServing.test_offline_throughput_without_chunked_prefill
-
name
:
Benchmark
Serv
in
g
Throughput (w/
o ChunkedPrefill
)
-
name
:
Benchmark
Offl
in
e
Throughput (w/
Triton
)
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
performance-test-1-gpu-part-2
:
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on
:
1-gpu-runner
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v3
-
name
:
Install dependencies
run
:
|
pip install --upgrade pip
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
name
:
Benchmark Single Latency
timeout-minutes
:
10
run
:
|
cd test/srt
python3 -m unittest test_bench_latency.TestBenchLatency.test_default
-
name
:
Benchmark Online Latency
timeout-minutes
:
10
run
:
|
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
performance-test-2-gpu
:
performance-test-2-gpu
:
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
...
@@ -125,23 +150,24 @@ jobs:
...
@@ -125,23 +150,24 @@ jobs:
pip install -e "python[all]"
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-
name
:
Benchmark
Serv
in
g
Throughput (TP=2)
-
name
:
Benchmark
Offl
in
e
Throughput (TP=2)
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_
moe
_serving
_throughput.TestServingT
hroughput
.test
_default
python3 -m unittest test_
bench
_serving
.TestBenchServing.test_moe_offline_t
hroughput_default
-
name
:
Benchmark
Serving Latency
(TP=2)
-
name
:
Benchmark
Offline Throughput (w/o RadixAttention)
(TP=2)
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_
moe
_serving
_latency.TestServingLatency.test_default
python3 -m unittest test_
bench
_serving
.TestBenchServing.test_moe_offline_throughput_without_radix_cache
-
name
:
Benchmark S
erving Throughput (w/o RadixAttention)
(TP=2)
-
name
:
Benchmark S
ingle Latency
(TP=2)
timeout-minutes
:
10
timeout-minutes
:
10
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
python3 -m unittest test_bench_latency.TestBenchLatency.test_moe_default
accuracy-test-1-gpu
:
accuracy-test-1-gpu
:
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
if
:
github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
...
@@ -192,7 +218,7 @@ jobs:
...
@@ -192,7 +218,7 @@ jobs:
finish
:
finish
:
needs
:
[
needs
:
[
unit-test-frontend
,
unit-test-backend-part-0
,
unit-test-backend-part-1
,
unit-test-frontend
,
unit-test-backend-part-0
,
unit-test-backend-part-1
,
performance-test-1-gpu
,
performance-test-2-gpu
,
performance-test-1-gpu
-part-1
,
performance-test-1-gpu-part-2
,
performance-test-2-gpu
,
accuracy-test-1-gpu
,
accuracy-test-2-gpu
accuracy-test-1-gpu
,
accuracy-test-2-gpu
]
]
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
...
...
python/sglang/test/test_utils.py
View file @
68be2f6d
...
@@ -7,6 +7,7 @@ import subprocess
...
@@ -7,6 +7,7 @@ import subprocess
import
threading
import
threading
import
time
import
time
from
functools
import
partial
from
functools
import
partial
from
types
import
SimpleNamespace
from
typing
import
Callable
,
List
,
Optional
from
typing
import
Callable
,
List
,
Optional
import
numpy
as
np
import
numpy
as
np
...
@@ -14,6 +15,7 @@ import requests
...
@@ -14,6 +15,7 @@ import requests
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
sglang.bench_serving
import
run_benchmark
from
sglang.global_config
import
global_config
from
sglang.global_config
import
global_config
from
sglang.lang.backend.openai
import
OpenAI
from
sglang.lang.backend.openai
import
OpenAI
from
sglang.lang.backend.runtime_endpoint
import
RuntimeEndpoint
from
sglang.lang.backend.runtime_endpoint
import
RuntimeEndpoint
...
@@ -501,3 +503,47 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
...
@@ -501,3 +503,47 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
def
get_similarities
(
vec1
,
vec2
):
def
get_similarities
(
vec1
,
vec2
):
return
F
.
cosine_similarity
(
torch
.
tensor
(
vec1
),
torch
.
tensor
(
vec2
),
dim
=
0
)
return
F
.
cosine_similarity
(
torch
.
tensor
(
vec1
),
torch
.
tensor
(
vec2
),
dim
=
0
)
def run_bench_serving(model, num_prompts, request_rate, other_server_args):
    """Launch an sglang server, run ``sglang.bench_serving`` against it, and
    return the benchmark result dict.

    Args:
        model: Model path/name passed to the server launcher.
        num_prompts: Number of random prompts to benchmark with.
        request_rate: Requests per second; ``float("inf")`` means offline
            (all requests issued at once).
        other_server_args: Extra CLI args forwarded to the server process.

    Returns:
        The dict produced by ``run_benchmark`` (keys include "completed",
        "output_throughput", "median_e2e_latency_ms", ... — see
        sglang.bench_serving).
    """
    # Launch the server
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    # Run benchmark.  SimpleNamespace mimics the argparse namespace that
    # bench_serving's CLI would normally build.
    args = SimpleNamespace(
        backend="sglang",
        base_url=base_url,
        host=None,
        port=None,
        dataset_name="random",
        dataset_path="",
        model=None,  # presumably resolved from the running server — TODO confirm
        tokenizer=None,
        num_prompts=num_prompts,
        sharegpt_output_len=None,
        random_input_len=4096,
        random_output_len=2048,
        random_range_ratio=0.0,
        request_rate=request_rate,
        multi=None,
        seed=0,
        output_file=None,
        disable_tqdm=False,
        disable_stream=False,
        disable_ignore_eos=False,
        extra_request_body=None,
    )

    try:
        res = run_benchmark(args)
    finally:
        # Always tear down the server, even if the benchmark raises.
        kill_child_process(process.pid)

    assert res["completed"] == num_prompts
    return res
test/srt/test_bench_latency.py
0 → 100644
View file @
68be2f6d
import
os
import
subprocess
import
unittest
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
)
class TestBenchLatency(unittest.TestCase):
    """CI tests for ``python -m sglang.bench_latency`` (single-batch latency).

    The decode-throughput threshold is only asserted when running in CI
    (SGLANG_IS_IN_CI=true); locally the tests just exercise the path.
    """

    def _run_bench_latency(self, model, extra_args, min_decode_throughput):
        """Run bench_latency as a subprocess and check decode throughput.

        Args:
            model: model path passed via ``--model-path``.
            extra_args: additional CLI args (e.g. ``["--tp", "2"]``).
            min_decode_throughput: CI-only lower bound (tokens/s).
        """
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model-path",
            model,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            *extra_args,
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            # NOTE(review): fragile — relies on the exact layout of
            # bench_latency's summary (3rd line from the end; 2nd-to-last
            # whitespace-separated token is the throughput number).
            lastline = output.split("\n")[-3]
            value = float(lastline.split(" ")[-2])

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                assert value > min_decode_throughput
        finally:
            # Kill the benchmark subprocess even if parsing/assertions fail.
            kill_child_process(process.pid)

    def test_default(self):
        self._run_bench_latency(DEFAULT_MODEL_NAME_FOR_TEST, [], 130)

    def test_moe_default(self):
        # Unified on "--model-path": the original passed "--model", which
        # only worked through argparse prefix matching.
        self._run_bench_latency(
            DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2"], 125
        )


if __name__ == "__main__":
    unittest.main()
test/srt/test_bench_serving.py
0 → 100644
View file @
68be2f6d
import
os
import
unittest
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
run_bench_serving
,
)
class TestBenchServing(unittest.TestCase):
    """CI performance tests driving ``sglang.bench_serving`` via
    ``run_bench_serving``.

    Throughput/latency thresholds are only asserted in CI
    (SGLANG_IS_IN_CI=true); locally the tests just exercise the path.
    """

    def test_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),  # offline: issue all requests at once
            other_server_args=[],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2600

    def test_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--disable-radix-cache"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2800

    def test_offline_throughput_without_chunked_prefill(self):
        # -1 disables chunked prefill.
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--chunked-prefill-size", "-1"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2600

    def test_offline_throughput_with_triton_attention_backend(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[
                "--attention-backend",
                "triton",
                "--context-length",
                "8192",
            ],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2600

    def test_online_latency_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=100,
            request_rate=1,  # online: 1 request per second
            other_server_args=[],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["median_e2e_latency_ms"] < 12000
            assert res["median_ttft_ms"] < 78
            assert res["median_itl_ms"] < 12

    def test_moe_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1850

    def test_moe_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2", "--disable-radix-cache"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1950


if __name__ == "__main__":
    unittest.main()
test/srt/test_moe_serving_latency.py
deleted
100644 → 0
View file @
b912de11
import
os
import
subprocess
import
unittest
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
DEFAULT_MOE_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
    # NOTE(review): this file is deleted by this commit — superseded by
    # test_bench_latency.TestBenchLatency.test_moe_default.

    def test_default(self):
        """Benchmark single-batch latency for the MoE model (TP=2)."""
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        # Parse decode throughput from the 3rd-to-last output line.
        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        # NOTE(review): not in try/finally — if parsing or the assert fails,
        # the child process is leaked (fixed in the replacement test).
        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
test/srt/test_moe_serving_throughput.py
deleted
100644 → 0
View file @
b912de11
import
os
import
unittest
from
types
import
SimpleNamespace
from
sglang.bench_serving
import
run_benchmark
from
sglang.srt.server_args
import
ServerArgs
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
(
DEFAULT_MOE_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class TestServingThroughput(unittest.TestCase):
    # NOTE(review): this file is deleted by this commit — superseded by
    # test_bench_serving.TestBenchServing.test_moe_offline_throughput_*.

    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
        """Launch the MoE model server (TP=2), run bench_serving, return the
        result dict produced by run_benchmark."""
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if attention_backend:
            other_args.extend(["--attention-backend", attention_backend])
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])

        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
        process = popen_launch_server(
            model,
            base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        # Run benchmark; SimpleNamespace mimics bench_serving's CLI namespace.
        num_prompts = 300
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            # Always tear down the server, even if the benchmark raises.
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts
        return res

    def test_default(self):
        # ServerArgs.* class attributes supply the server's default values.
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1800

    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1950


if __name__ == "__main__":
    unittest.main()
test/srt/test_serving_latency.py
deleted
100644 → 0
View file @
b912de11
import
os
import
subprocess
import
unittest
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
DEFAULT_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
    # NOTE(review): this file is deleted by this commit — superseded by
    # test_bench_latency.TestBenchLatency.test_default.

    def test_default(self):
        """Benchmark single-batch latency for the default model."""
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model-path",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        # Parse decode throughput from the 3rd-to-last output line.
        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        # NOTE(review): not in try/finally — if parsing or the assert fails,
        # the child process is leaked (fixed in the replacement test).
        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
test/srt/test_serving_throughput.py
deleted
100644 → 0
View file @
b912de11
import
os
import
unittest
from
types
import
SimpleNamespace
from
sglang.bench_serving
import
run_benchmark
from
sglang.srt.server_args
import
ServerArgs
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class TestServingThroughput(unittest.TestCase):
    # NOTE(review): this file is deleted by this commit — superseded by
    # test_bench_serving.TestBenchServing.test_offline_throughput_*.

    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
        """Launch the default model server, run bench_serving, return the
        result dict produced by run_benchmark."""
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if attention_backend:
            other_args.extend(["--attention-backend", attention_backend])
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])

        model = DEFAULT_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
        process = popen_launch_server(
            model,
            base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        # Run benchmark; SimpleNamespace mimics bench_serving's CLI namespace.
        num_prompts = 500
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            # Always tear down the server, even if the benchmark raises.
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts
        return res

    def test_default(self):
        # ServerArgs.* class attributes supply the server's default values.
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2400

    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2800

    def test_default_without_chunked_prefill(self):
        # -1 disables chunked prefill.
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=-1,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2400

    def test_default_with_triton_attention_backend(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend="triton",
            chunked_prefill_size=-1,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2400


if __name__ == "__main__":
    unittest.main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment