Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fc601665
Unverified
Commit
fc601665
authored
Dec 24, 2024
by
Jiaxin Shan
Committed by
GitHub
Dec 25, 2024
Browse files
[Misc] Update disaggregation benchmark scripts and test logs (#11456)
Signed-off-by:
Jiaxin Shan
<
seedjeffwan@gmail.com
>
parent
9832e557
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
43 additions
and
29 deletions
+43
-29
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+7
-6
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+6
-7
tests/kv_transfer/test_lookup_buffer.py
tests/kv_transfer/test_lookup_buffer.py
+2
-2
tests/kv_transfer/test_lookup_buffer.sh
tests/kv_transfer/test_lookup_buffer.sh
+7
-2
tests/kv_transfer/test_send_recv.py
tests/kv_transfer/test_send_recv.py
+14
-11
tests/kv_transfer/test_send_recv.sh
tests/kv_transfer/test_send_recv.sh
+7
-1
No files found.
benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
View file @
fc601665
...
...
@@ -10,7 +10,8 @@ set -ex
kill_gpu_processes
()
{
# kill all processes on GPU.
pkill
-f
pt_main_thread
pgrep pt_main_thread | xargs
-r
kill
-9
pgrep python3 | xargs
-r
kill
-9
sleep
10
# remove vllm config file
...
...
@@ -54,7 +55,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES
=
0 python3
\
-m
vllm.entrypoints.openai.api_server
\
--model
meta-llama/Meta-Llama-3.1-8B-Instruct
\
--model
$model
\
--port
8100
\
--max-model-len
10000
\
--gpu-memory-utilization
0.6
\
...
...
@@ -64,7 +65,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES
=
1 python3
\
-m
vllm.entrypoints.openai.api_server
\
--model
meta-llama/Meta-Llama-3.1-8B-Instruct
\
--model
$model
\
--port
8200
\
--max-model-len
10000
\
--gpu-memory-utilization
0.6
\
...
...
@@ -87,7 +88,7 @@ benchmark() {
--port
8100
\
--save-result
\
--result-dir
$results_folder
\
--result-filename
disagg_prefill_
2x
tp
4
.json
\
--result-filename
disagg_prefill_tp
1
.json
\
--request-rate
"inf"
...
...
@@ -105,7 +106,7 @@ benchmark() {
--port
8200
\
--save-result
\
--result-dir
$results_folder
\
--result-filename
disagg_prefill_
2xtp4
.json
\
--result-filename
disagg_prefill_
tp1_overhead
.json
\
--request-rate
"
$qps
"
kill_gpu_processes
...
...
@@ -118,7 +119,7 @@ main() {
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
pip
install
quart httpx
pip
install
quart httpx
datasets
cd
"
$(
dirname
"
$0
"
)
"
...
...
benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
View file @
fc601665
#!/bin/bash
# Requirement:
8x H100
GPUs.
# Requirement:
2x
GPUs.
# Model:
neuralmagic
/Meta-Llama-3
-70
B-Instruct
-FP8-KV
# Query:
2048
input tokens,
11
output tokens, QPS
4
,
5
00 requests
# Resource:
8
x
H100
# Model:
meta-llama
/Meta-Llama-3
.1-8
B-Instruct
# Query:
1024
input tokens,
6
output tokens, QPS
2/4/6/8
,
1
00 requests
# Resource:
2
x
GPU
# Approaches:
# 1. Chunked prefill: 1 vllm instance with tp=8
# 2. Chunked prefill: 2 vllm instances with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
...
...
@@ -114,7 +113,6 @@ benchmark() {
--request-rate
"
$qps
"
sleep
2
}
...
...
@@ -123,8 +121,9 @@ main() {
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get
-y
install
jq
)
(
which socat
)
||
(
apt-get
-y
install
socat
)
(
which lsof
)
||
(
apt-get
-y
install
lsof
)
pip
install
quart httpx matplotlib aiohttp
pip
install
quart httpx matplotlib aiohttp
datasets
cd
"
$(
dirname
"
$0
"
)
"
...
...
tests/kv_transfer/test_lookup_buffer.py
View file @
fc601665
...
...
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert
buffer
.
buffer_size
==
0
assert
len
(
buffer
.
buffer
)
==
0
print
(
"Test run passed!"
)
print
(
"
My rank: %d,
Test run passed!"
%
(
my_rank
)
)
def
stress_test
(
my_rank
,
buf
,
device
):
...
...
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else
:
torch
.
distributed
.
send
(
torch
.
tensor
([
n
]),
0
)
print
(
"Passed stress test!"
)
print
(
"
My rank: %d,
Passed stress test!"
%
(
my_rank
)
)
if
__name__
==
"__main__"
:
...
...
tests/kv_transfer/test_lookup_buffer.sh
View file @
fc601665
#!/bin/bash
RANK
=
0 python test_lookup_buffer.py &
RANK
=
1 python test_lookup_buffer.py &
\ No newline at end of file
RANK
=
0 python3 test_lookup_buffer.py &
PID0
=
$!
RANK
=
1 python3 test_lookup_buffer.py &
PID1
=
$!
wait
$PID0
wait
$PID1
tests/kv_transfer/test_send_recv.py
View file @
fc601665
...
...
@@ -10,39 +10,42 @@ from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
def
test_run
(
my_rank
,
pipe
):
print
(
f
"rank
{
my_rank
}
test_run starts...."
)
# test run
x
=
torch
.
tensor
([
1
]).
to
(
pipe
.
device
)
y
=
torch
.
tensor
([[
2.
,
3.
,
4.
,
8.
]]).
to
(
pipe
.
device
)
if
my_rank
==
0
:
pipe
.
send_tensor
(
x
)
print
(
"
sent tensor x"
)
print
(
f
"rank
{
my_rank
}
sent tensor x"
)
pipe
.
send_tensor
(
y
)
print
(
"
sent tensor y"
)
print
(
f
"rank
{
my_rank
}
sent tensor y"
)
x2
=
pipe
.
recv_tensor
()
print
(
"
received x2 = "
,
x2
)
print
(
f
"rank
{
my_rank
}
received x2 = "
,
x2
)
y2
=
pipe
.
recv_tensor
()
print
(
"
received y2 = "
,
x2
)
print
(
f
"rank
{
my_rank
}
received y2 = "
,
x2
)
else
:
x2
=
pipe
.
recv_tensor
()
print
(
"
received x2 = "
,
x2
)
print
(
f
"rank
{
my_rank
}
received x2 = "
,
x2
)
y2
=
pipe
.
recv_tensor
()
print
(
"
received y2 = "
,
x2
)
print
(
f
"rank
{
my_rank
}
received y2 = "
,
x2
)
pipe
.
send_tensor
(
x
)
print
(
"
sent tensor x"
)
print
(
f
"rank
{
my_rank
}
sent tensor x"
)
pipe
.
send_tensor
(
y
)
print
(
"
sent tensor y"
)
print
(
f
"rank
{
my_rank
}
sent tensor y"
)
assert
torch
.
allclose
(
x
,
x2
)
assert
torch
.
allclose
(
y
,
y2
)
print
(
f
"rank
{
my_rank
}
test_run passed!"
)
def
stress_test
(
my_rank
,
pipe
):
torch
.
distributed
.
barrier
()
def
stress_test
(
my_rank
,
pipe
):
print
(
f
"rank
{
my_rank
}
stress_test starts...."
)
tensors
:
List
[
torch
.
Tensor
]
=
[]
torch
.
distributed
.
barrier
()
torch
.
manual_seed
(
0
)
for
i
in
tqdm
(
range
(
500
)):
...
...
@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):
def
latency_test
(
my_rank
,
pipe
,
nelement
,
ntensor
):
latencies
=
[]
torch
.
distributed
.
barrier
()
...
...
@@ -149,6 +151,7 @@ if __name__ == "__main__":
)
test_run
(
my_rank
,
pipe
)
stress_test
(
my_rank
,
pipe
)
# Use this function if you want to test the latency of pipe impl.
...
...
tests/kv_transfer/test_send_recv.sh
View file @
fc601665
#!/bin/bash
RANK
=
0 python3 test_send_recv.py &
RANK
=
1 python3 test_send_recv.py &
\ No newline at end of file
PID0
=
$!
RANK
=
1 python3 test_send_recv.py &
PID1
=
$!
wait
$PID0
wait
$PID1
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment