Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
easystart_v0.2
Commits
f6a338d7
Commit
f6a338d7
authored
Jul 16, 2025
by
jerrrrry
Browse files
Initial commit
parents
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
2973 additions
and
0 deletions
+2973
-0
3_env_check-batch_onlinetests/Dockerfile
3_env_check-batch_onlinetests/Dockerfile
+42
-0
3_env_check-batch_onlinetests/configs/model_to_test.cfg
3_env_check-batch_onlinetests/configs/model_to_test.cfg
+26
-0
3_env_check-batch_onlinetests/env_check_tools/dcu_env_check.zip
...check-batch_onlinetests/env_check_tools/dcu_env_check.zip
+0
-0
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip
+0
-0
3_env_check-batch_onlinetests/scripts/backend_request_func.py
...v_check-batch_onlinetests/scripts/backend_request_func.py
+505
-0
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py
+817
-0
3_env_check-batch_onlinetests/scripts/benchmark_serving.py
3_env_check-batch_onlinetests/scripts/benchmark_serving.py
+1088
-0
3_env_check-batch_onlinetests/scripts/benchmark_utils.py
3_env_check-batch_onlinetests/scripts/benchmark_utils.py
+69
-0
3_env_check-batch_onlinetests/scripts/entrypoint.sh
3_env_check-batch_onlinetests/scripts/entrypoint.sh
+12
-0
3_env_check-batch_onlinetests/scripts/run_benchmark.sh
3_env_check-batch_onlinetests/scripts/run_benchmark.sh
+166
-0
3_env_check-batch_onlinetests/scripts/run_envcheck.sh
3_env_check-batch_onlinetests/scripts/run_envcheck.sh
+163
-0
3_env_check-batch_onlinetests/scripts/test.sh
3_env_check-batch_onlinetests/scripts/test.sh
+64
-0
3_env_check-batch_onlinetests/start.sh
3_env_check-batch_onlinetests/start.sh
+21
-0
No files found.
3_env_check-batch_onlinetests/Dockerfile
0 → 100644
View file @
f6a338d7
# Base image: official SourceFind DCU vLLM image.
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250711

# Install basic diagnostic / utility tools, then drop the apt cache to keep
# the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
        sysstat \
        locate \
    && rm -rf /var/lib/apt/lists/*

# Create the directory layout in a single layer (mkdir -p accepts multiple
# paths; six separate mkdir invocations are unnecessary).
RUN mkdir -p \
        /workspace/scripts \
        /workspace/configs \
        /workspace/test/env_check_outputs \
        /workspace/test/inference_outputs \
        /workspace/test/models \
        /workspace/test/env_check_tools

# Copy scripts, configs and the env-check tool archives.
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
COPY ./env_check_tools/dcu_env_check.zip /workspace/test/env_check_tools/
COPY ./env_check_tools/rccl-tests.zip /workspace/test/env_check_tools/

# BUG FIX: the original ran `chmod +x /workspace/configs*` (missing the
# trailing slash), which only matched the directory itself and left the
# config files untouched. Use /workspace/configs/* to match the files,
# consistent with the scripts line.
RUN chmod +x /workspace/scripts/* && \
    chmod +x /workspace/configs/*

# Work directly in the scripts directory so the entrypoint needs no cd.
WORKDIR /workspace/scripts/

# Exec-form CMD; the original `bash -c "bash entrypoint.sh"` spawned a
# redundant extra shell.
CMD ["bash", "entrypoint.sh"]
3_env_check-batch_onlinetests/configs/model_to_test.cfg
0 → 100644
View file @
f6a338d7
Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;4;float16;"1 ";(512 512);32768;0.95
Qwen2.5-VL-7B;/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/;1;float16;"1 ";(512 512);32768;0.95
#Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms/qwen3/Qwen3-32B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms//qwen3/Qwen3-32B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-4B;/workspace/llms/qwen3/Qwen3-4B/;1;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-235B-A22B;/workspace/llms/qwen3/Qwen3-235B-A22B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024);20000;0.95
3_env_check-batch_onlinetests/env_check_tools/dcu_env_check.zip
0 → 100644
View file @
f6a338d7
File added
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip
0 → 100644
View file @
f6a338d7
File added
3_env_check-batch_onlinetests/scripts/backend_request_func.py
0 → 100644
View file @
f6a338d7
# SPDX-License-Identifier: Apache-2.0
import
json
import
os
import
sys
import
time
import
traceback
from
dataclasses
import
dataclass
,
field
from
typing
import
Optional
,
Union
import
aiohttp
import
huggingface_hub.constants
from
tqdm.asyncio
import
tqdm
from
transformers
import
(
AutoTokenizer
,
PreTrainedTokenizer
,
PreTrainedTokenizerFast
)
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.

# Overall per-request client timeout: six hours, generous enough for the
# longest streaming benchmark runs.
_SIX_HOURS_S = 6 * 60 * 60
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=_SIX_HOURS_S)
@dataclass
class RequestFuncInput:
    """Input bundle describing one benchmark request sent to a backend."""

    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    # Served model name to put in the payload; falls back to `model` if unset.
    model_name: Optional[str] = None
    # Number of logprobs to request, or None to omit the field.
    logprobs: Optional[int] = None
    # Extra keys merged into the request payload verbatim.
    extra_body: Optional[dict] = None
    # Optional multimodal chat content block (e.g. an image_url entry).
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False
@dataclass
class RequestFuncOutput:
    """Result of one benchmark request, including streaming timing stats."""

    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    # Inter-token latencies, one entry per decoded chunk after the first.
    itl: list[float] = field(default_factory=list)
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""
async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Stream one request to a TGI `generate_stream` endpoint.

    Measures TTFT and inter-token latencies from the SSE stream and returns
    a populated RequestFuncOutput; advances `pbar` by one when done.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        # When EOS is ignored the server generates exactly output_len tokens;
        # otherwise the true count is unknown until the stream ends.
        if request_func_input.ignore_eos:
            output.output_tokens = request_func_input.output_len
        else:
            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        if ttft == 0.0:
                            # First token.
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                        else:
                            # Decoding phase.
                            output.itl.append(timestamp -
                                              most_recent_timestamp)
                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Stream one request to a TensorRT-LLM `generate_stream` endpoint.

    Accumulates the generated text chunk by chunk while recording TTFT and
    inter-token latencies; advances `pbar` by one when done.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        # Forcing min_length pins the generation length when EOS is ignored.
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")
                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]

                        timestamp = time.perf_counter()
                        if ttft == 0.0:
                            # First token.
                            ttft = timestamp - st
                            output.ttft = ttft
                        else:
                            # Decoding phase.
                            output.itl.append(timestamp -
                                              most_recent_timestamp)
                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Send one (non-streaming) request to a DeepSpeed-MII endpoint.

    DeepSpeed-MII does not support streaming, so TTFT is reported as 0 and
    only total latency is measured. Advances `pbar` by one when done.
    """
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    # BUG FIX: the original set `output.success = True`
                    # unconditionally AFTER the if/elif/else, which
                    # overwrote the `success = False` assigned in the
                    # unexpected-format branch. Success is now set only on
                    # the branches that actually extracted text.
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                        output.success = True
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                        output.success = True
                    else:
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Stream one request to an OpenAI-compatible /v1/completions endpoint.

    Parses the SSE stream to measure TTFT and inter-token latencies, reads
    the completion-token count from the trailing usage chunk, and marks the
    request failed if no content chunk ever arrived.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": request_func_input.model_name
            if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            # Ask the server to append a usage chunk with the token counts.
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                if not first_chunk_received:
                                    # First token.
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft
                                else:
                                    # Decoding phase.
                                    output.itl.append(
                                        timestamp - most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    """Stream one request to an OpenAI-compatible /v1/chat/completions endpoint.

    Builds a single-user-turn message (optionally with multimodal content),
    streams the reply to measure TTFT and inter-token latencies, and reads
    the completion-token count from the trailing usage chunk.
    """
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model_name
            if request_func_input.model_name else request_func_input.model,
            "messages": [
                {"role": "user", "content": content},
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                # Renamed from `content` to avoid shadowing
                                # the message-content list built above.
                                delta_content = choices[0]["delta"].get(
                                    "content")
                                if ttft == 0.0:
                                    # First token.
                                    ttft = timestamp - st
                                    output.ttft = ttft
                                else:
                                    # Decoding phase.
                                    output.itl.append(
                                        timestamp - most_recent_timestamp)

                                generated_text += delta_content or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output
def get_model(pretrained_model_name_or_path: str) -> str:
    """Resolve a model identifier to a local path.

    When the VLLM_USE_MODELSCOPE environment variable is "true"
    (case-insensitive), downloads the model via ModelScope (weights
    excluded) and returns the local snapshot directory. Otherwise the
    identifier is returned unchanged.
    """
    use_modelscope = os.getenv('VLLM_USE_MODELSCOPE', 'False').lower()
    if use_modelscope == 'true':
        from modelscope import snapshot_download

        from vllm.model_executor.model_loader.weight_utils import get_lock

        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(pretrained_model_name_or_path):
            model_path = snapshot_download(
                model_id=pretrained_model_name_or_path,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
            return model_path
    return pretrained_model_name_or_path
def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load a tokenizer for benchmarking.

    Resolves non-local identifiers through get_model() first. Supports
    three modes: "auto" (default fast tokenizer), "slow" (forces
    use_fast=False), and "mistral" (vLLM's MistralTokenizer, requires the
    vllm package).

    Raises:
        ValueError: if use_fast=True is combined with slow mode.
        ImportError: if mistral mode is requested without vllm installed.
    """
    if (pretrained_model_name_or_path is not None
            and not os.path.exists(pretrained_model_name_or_path)):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False

    if tokenizer_mode == "mistral":
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
        except ImportError as e:
            raise ImportError("MistralTokenizer requires vllm package.\n"
                              "Please install it with `pip install vllm` "
                              "to use mistral tokenizer mode.") from e
        return MistralTokenizer.from_pretrained(
            str(pretrained_model_name_or_path))
    else:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )
# Maps the benchmark's --backend choice to the coroutine that drives one
# request against that backend.
ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

# Backends that speak the OpenAI (chat-)completions wire protocol, derived
# from the dispatch table above so the two can never drift apart.
OPENAI_COMPATIBLE_BACKENDS = [
    backend for backend, func in ASYNC_REQUEST_FUNCS.items()
    if func in (async_request_openai_completions,
                async_request_openai_chat_completions)
]
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py
0 → 100644
View file @
f6a338d7
# SPDX-License-Identifier: Apache-2.0
"""
This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample
generation. Supported dataset types include:
- ShareGPT
- Random (synthetic)
- Sonnet
- BurstGPT
- HuggingFace
- VisionArena
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
"""
import
base64
import
io
import
json
import
logging
import
random
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Mapping
from
dataclasses
import
dataclass
from
functools
import
cache
from
io
import
BytesIO
from
typing
import
Any
,
Callable
,
Optional
,
Union
import
numpy
as
np
import
pandas
as
pd
from
datasets
import
load_dataset
from
PIL
import
Image
from
transformers
import
PreTrainedTokenizerBase
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
logger
=
logging
.
getLogger
(
__name__
)
# -----------------------------------------------------------------------------
# Data Classes
# -----------------------------------------------------------------------------
@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.
    """

    prompt: Union[str, Any]
    prompt_len: int
    expected_output_len: int
    # Optional multimodal payload attached to the prompt.
    multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
    # Optional LoRA adapter to apply for this request.
    lora_request: Optional[LoRARequest] = None
# -----------------------------------------------------------------------------
# Benchmark Dataset Base Class
# -----------------------------------------------------------------------------
class BenchmarkDataset(ABC):
    """Abstract base for benchmark datasets; subclasses implement sample()."""

    DEFAULT_SEED = 0

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed. Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
            indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
            sampling. Defaults to DEFAULT_SEED.
        """
        self.dataset_path = dataset_path
        # A None seed silently falls back to the class default so callers may
        # pass through an unset CLI argument.
        self.random_seed = (self.DEFAULT_SEED
                            if random_seed is None else random_seed)
        self.data = None

    def apply_multimodal_chat_transformation(
            self,
            prompt: str,
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific conversation
        format.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            content.append(mm_content)
        return [{"role": "user", "content": content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.
        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.
        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError(
            "load_data must be implemented in subclasses.")

    def get_random_lora_request(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
    ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.
        This method is used when LoRA parameters are provided. It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.
        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
            LoRA is selected. max_loras (Optional[int]): The maximum number of
            LoRAs available. If None, LoRA is not used. lora_path
            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
            is not used.
        Returns:
            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
            element is a LoRARequest (or None if not applicable) and the second
            element is the tokenizer associated with the LoRA request (or the
            base tokenizer).
        """
        if max_loras is None or lora_path is None:
            return None, tokenizer

        # Generate a random LoRA ID in the range [1, max_loras].
        lora_id = random.randint(1, max_loras)
        lora_request = LoRARequest(
            lora_name=str(lora_id),
            lora_int_id=lora_id,
            lora_path=lora_path_on_disk(lora_path),
        )
        if lora_id not in lora_tokenizer_cache:
            lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
        # Return lora_request and the cached tokenizer if available; otherwise,
        # return the base tokenizer
        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer

    @abstractmethod
    def sample(self, tokenizer: PreTrainedTokenizerBase,
               num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.
        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.
        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
            for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.
        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(self, requests: list[SampleRequest],
                                  num_requests: int) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number.
        Args:
            requests (List[SampleRequest]): The current list of sampled
            requests. num_requests (int): The target number of requests.
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
            additional = random.choices(requests,
                                        k=num_requests - len(requests))
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.",
                        num_requests)
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
# -----------------------------------------------------------------------------
def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Validate a sequence based on prompt and output lengths.
    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    """
    # A sequence is valid only when every pruning criterion passes; each
    # check below names the failure mode it guards against.
    if prompt_len < min_len:
        return False  # prompt too short
    if not skip_min_output_len_check and output_len < min_len:
        return False  # output too short
    if prompt_len > max_prompt_len:
        return False  # prompt too long
    if prompt_len + output_len > max_total_len:
        return False  # combined length too long
    return True
@cache
def lora_path_on_disk(lora_path: str) -> str:
    # Resolve (and memoize, via @cache) the absolute on-disk path of a
    # LoRA adapter identifier. Resolution itself is delegated to vLLM's
    # get_adapter_absolute_path helper.
    return get_adapter_absolute_path(lora_path)
# Global cache for LoRA tokenizers, keyed by the LoRA id (int) so each
# adapter's tokenizer is loaded at most once per process.
lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. Dictionary with raw image bytes: expects a dict with a 'bytes' key
       holding raw image data, which is loaded as a PIL.Image.Image.
    2. PIL.Image.Image input: converted to RGB, saved as an in-memory JPEG,
       base64-encoded, and returned as a data-URL image entry.
    3. String input: treated as a URL or local file path; "file://" is
       prepended when the string does not start with "http://" or "file://".

    Raises:
        ValueError: If the input is not a supported type.
    """
    # A bytes-dict is first materialized into a PIL image, then handled by
    # the PIL branch below.
    if isinstance(image, dict) and 'bytes' in image:
        image = Image.open(BytesIO(image['bytes']))

    if isinstance(image, Image.Image):
        rgb = image.convert("RGB")
        with io.BytesIO() as buffer:
            rgb.save(buffer, format="JPEG")
            encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded}"
            },
        }

    if isinstance(image, str):
        has_scheme = image.startswith(("http://", "file://"))
        image_url = image if has_scheme else f"file://{image}"
        return {"type": "image_url", "image_url": {"url": image_url}}

    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
                     " or str or dictionary with raw image bytes.")
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------
class RandomDataset(BenchmarkDataset):
    """Synthetic dataset: prompts are random token ids decoded back to text,
    with input/output lengths drawn uniformly from a range around the
    requested sizes."""

    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 0.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        **kwargs,
    ) -> list[SampleRequest]:
        """Generate `num_requests` synthetic requests.

        NOTE: uses the global NumPy RNG state; it is not re-seeded here, so
        reproducibility depends on the caller seeding np.random beforehand.
        """
        # Enforce range_ratio < 1
        assert range_ratio < 1.0, (
            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
        )

        vocab_size = tokenizer.vocab_size

        # Optional shared prefix of random token ids prepended to every prompt.
        prefix_token_ids = (np.random.randint(
            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])

        # New sampling logic: [X * (1 - b), X * (1 + b)]
        input_low = int(input_len * (1 - range_ratio))
        input_high = int(input_len * (1 + range_ratio))
        output_low = int(output_len * (1 - range_ratio))
        output_high = int(output_len * (1 + range_ratio))

        # Add logging for debugging
        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
        logger.info("Sampling output_len from [%s, %s]", output_low,
                    output_high)

        input_lens = np.random.randint(input_low,
                                       input_high + 1,
                                       size=num_requests)
        output_lens = np.random.randint(output_low,
                                        output_high + 1,
                                        size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)

        requests = []
        for i in range(num_requests):
            # Per-request offset plus the index keeps prompts distinct;
            # the modulo keeps every id inside the vocabulary.
            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
                         vocab_size).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
                ))
        return requests
# -----------------------------------------------------------------------------
# ShareGPT Dataset Implementation
# -----------------------------------------------------------------------------
class ShareGPTDataset(BenchmarkDataset):
    """
    Implements the ShareGPT dataset. Loads data from a JSON file and generates
    sample requests based on conversation turns.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        # Load the raw conversations, drop entries with fewer than two
        # turns, then shuffle deterministically with the configured seed.
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = json.load(f)
        # Filter entries with at least two conversation turns.
        self.data = [
            entry for entry in self.data
            if "conversations" in entry and len(entry["conversations"]) >= 2
        ]
        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """Build up to `num_requests` SampleRequests from conversation
        first-turn prompts; oversamples (with replacement) if the filtered
        data yields fewer than requested."""
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            # First turn is the prompt, second turn is the reference
            # completion used to derive the expected output length.
            prompt, completion = (
                entry["conversations"][0]["value"],
                entry["conversations"][1]["value"],
            )

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            new_output_len = (len(completion_ids)
                              if output_len is None else output_len)
            # Skip the min-output-length check when the caller pinned a
            # fixed output_len explicitly.
            if not is_valid_sequence(prompt_len,
                                     new_output_len,
                                     skip_min_output_len_check=output_len
                                     is not None):
                continue
            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
        self.maybe_oversample_requests(samples, num_requests)
        return samples
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------
class SonnetDataset(BenchmarkDataset):
    """
    Simplified implementation of the Sonnet dataset. Loads poem lines from a
    text file and generates sample requests. Default values here copied from
    `benchmark_serving.py` for the sonnet dataset.
    """

    DEFAULT_PREFIX_LEN = 200
    DEFAULT_INPUT_LEN = 550
    DEFAULT_OUTPUT_LEN = 150

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        # Each line of the text file is one candidate poem line.
        if not self.dataset_path:
            raise ValueError("dataset_path must be provided.")
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

    def sample(
        self,
        tokenizer,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
        **kwargs,
    ) -> list:
        """Assemble prompts of roughly `input_len` tokens by concatenating
        poem lines (a deterministic prefix of `prefix_len` worth of lines
        plus randomly chosen extras) onto a fixed base instruction."""
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
                      for tokens in tokenized_lines) / len(tokenized_lines)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        base_msg = [{"role": "user", "content": base_prompt}]
        base_fmt = tokenizer.apply_chat_template(base_msg,
                                                 add_generation_prompt=True,
                                                 tokenize=False)
        base_offset = len(tokenizer(base_fmt).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
                f"({base_offset}).")

        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
        prefix_lines = self.data[:num_prefix_lines]

        samples = []
        while len(samples) < num_requests:
            extra_lines = random.choices(self.data,
                                         k=num_input_lines - num_prefix_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
            msg = [{"role": "user", "content": prompt}]
            prompt_formatted = tokenizer.apply_chat_template(
                msg, add_generation_prompt=True, tokenize=False)
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            # Keep only prompts that fit within input_len; otherwise resample.
            # NOTE(review): this loop can spin indefinitely if no sampled
            # prompt ever fits — worth confirming for small input_len values.
            if prompt_len <= input_len:
                samples.append(
                    SampleRequest(
                        prompt=prompt_formatted
                        if return_prompt_formatted else prompt,
                        prompt_len=prompt_len,
                        expected_output_len=output_len,
                    ))
        return samples
# -----------------------------------------------------------------------------
# BurstGPT Dataset Implementation
# -----------------------------------------------------------------------------
class BurstGPTDataset(BenchmarkDataset):
    """
    Implements the BurstGPT dataset. Loads data from a CSV file and generates
    sample requests based on synthetic prompt generation. Only rows with Model
    "GPT-4" and positive response tokens are used.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self, ):
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        df = pd.read_csv(self.dataset_path)
        # Filter to keep only GPT-4 rows.
        gpt4_df = df[df["Model"] == "GPT-4"]
        # Remove failed requests (where Response tokens is 0 or less).
        gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
        # Sample the desired number of rows.
        self.data = gpt4_df

    def _sample_loaded_data(self, num_requests: int) -> list:
        # Draw rows with a fixed random_state for reproducibility; sample
        # with replacement only when more rows are requested than exist.
        if num_requests <= len(self.data):
            data = self.data.sample(n=num_requests,
                                    random_state=self.random_seed)
        else:
            data = self.data.sample(
                n=num_requests,
                random_state=self.random_seed,
                replace=True,
            )
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
        **kwargs,
    ) -> list[SampleRequest]:
        """Generate requests whose input/output token counts come from the
        CSV rows; prompt text itself is synthetic."""
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
            # Columns 2 and 3 hold the request/response token counts
            # (presumably "Request tokens" / "Response tokens" — verify
            # against the BurstGPT CSV schema).
            input_len = int(data[i][2])
            output_len = int(data[i][3])
            lora_req, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            vocab_size = tokenizer.vocab_size
            # Generate a synthetic prompt: a list of token IDs computed as (i +
            # j) modulo vocab_size.
            token_ids = [(i + j) % vocab_size for j in range(input_len)]
            prompt = tokenizer.decode(token_ids)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=input_len,
                    expected_output_len=output_len,
                    lora_request=lora_req,
                ))
        return samples
# -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation
# -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
    """Base class for datasets hosted on HuggingFace."""

    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(dataset_path=dataset_path, **kwargs)
        self.dataset_subset = dataset_subset
        self.dataset_split = dataset_split
        self.load_data()

    def load_data(self) -> None:
        """Load data from HuggingFace datasets."""
        # Stream the split (no full download) and shuffle it with the
        # configured seed for reproducible iteration order.
        raw = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
        self.data = raw.shuffle(seed=self.random_seed)
# -----------------------------------------------------------------------------
# Conversation Dataset Implementation
# -----------------------------------------------------------------------------
class ConversationDataset(HuggingFaceDataset):
    """Dataset for conversation data with multimodal support."""

    SUPPORTED_DATASET_PATHS = {
        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(
            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        # With no fixed output_len, each request uses its own completion
        # length (and gets length-validated below).
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            # First turn is the prompt, second is the reference completion.
            prompt, completion = conv[0]["value"], conv[1]["value"]

            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            completion_len = len(completion_ids)
            # NOTE: deliberately rebinds the `output_len` parameter each
            # iteration in the dynamic case.
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            if dynamic_output and not is_valid_sequence(
                    prompt_len, completion_len):
                continue
            mm_content = process_image(
                item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
# -----------------------------------------------------------------------------
# Vision Arena Dataset Implementation
# -----------------------------------------------------------------------------
class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena Dataset.

    Each supported dataset path maps to a parser extracting the first user
    message of an item; every sampled request carries the item's first image
    as multimodal content.
    """

    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {
        "lmarena-ai/VisionArena-Chat":
        lambda x: x["conversation"][0][0]["content"],
        "lmarena-ai/vision-arena-bench-v0.1":
        lambda x: x["turns"][0][0]["content"]
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """Build up to `num_requests` multimodal SampleRequests, oversampling
        if the dataset yields fewer."""
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        # The parser only depends on the dataset path, so resolve it once
        # instead of re-doing the lookup (and the validity check) on every
        # item; this also fails fast on an unsupported path before any
        # streaming data is consumed.
        parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
        if parser_fn is None:
            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = parser_fn(item)
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no longer
                # accurate and we will be using request output to count the
                # actual prompt len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is the dataset designed for general code editing. It consists
    of 114,239 instruction-input-output triplets, and covers multiple distinct
    code editing scenario.
    """

    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    SUPPORTED_DATASET_PATHS = {
        "likaixin/InstructCoder",
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        # Fall back to the dataset's average output length when none given.
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            # Prompt = editing instruction followed by the code to edit.
            prompt = f"{item['instruction']}:\n{item['input']}"
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
class AIMODataset(HuggingFaceDataset):
    """
    Dataset class for processing a AIMO dataset with reasoning questions.
    """
    SUPPORTED_DATASET_PATHS = {
        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
        "AI-MO/NuminaMath-CoT"
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               **kwargs) -> list:
        sampled_requests = []
        # With no fixed output_len, each request uses its solution's length.
        dynamic_output = output_len is None

        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            # 'problem' is the prompt; 'solution' is the reference answer
            # used only to size the expected output.
            prompt, completion = item['problem'], item["solution"]

            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            completion_len = len(completion_ids)
            # NOTE: deliberately rebinds the `output_len` parameter each
            # iteration in the dynamic case.
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            # Math solutions run long, so relax the default length limits.
            if dynamic_output and not is_valid_sequence(prompt_len,
                                                        completion_len,
                                                        max_prompt_len=2048,
                                                        max_total_len=32000):
                continue
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
3_env_check-batch_onlinetests/scripts/benchmark_serving.py
0 → 100644
View file @
f6a338d7
# SPDX-License-Identifier: Apache-2.0
r
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
--model <your_model> \
--dataset-name sharegpt \
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
"""
import
argparse
import
asyncio
import
gc
import
json
import
os
import
random
import
time
import
warnings
from
collections.abc
import
AsyncGenerator
,
Iterable
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
typing
import
Any
,
Optional
import
numpy
as
np
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
OPENAI_COMPATIBLE_BACKENDS
,
RequestFuncInput
,
RequestFuncOutput
)
from
tqdm.asyncio
import
tqdm
from
transformers
import
PreTrainedTokenizerBase
try
:
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
except
ImportError
:
from
backend_request_func
import
get_tokenizer
try
:
from
vllm.utils
import
FlexibleArgumentParser
except
ImportError
:
from
argparse
import
ArgumentParser
as
FlexibleArgumentParser
from
benchmark_dataset
import
(
AIMODataset
,
BurstGPTDataset
,
ConversationDataset
,
HuggingFaceDataset
,
InstructCoderDataset
,
RandomDataset
,
SampleRequest
,
ShareGPTDataset
,
SonnetDataset
,
VisionArenaDataset
)
from
benchmark_utils
import
convert_to_pytorch_benchmark_format
,
write_to_json
# Divisor for converting goodput SLO values given in milliseconds to seconds.
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@dataclass
class BenchmarkMetrics:
    """Aggregate client-side results of one serving benchmark run.

    `percentiles_*` fields hold (percentile, value_ms) pairs for the
    percentiles selected on the command line.
    """
    completed: int
    total_input: int
    total_output: int
    request_throughput: float
    request_goodput: float
    output_throughput: float
    total_token_throughput: float
    mean_ttft_ms: float
    median_ttft_ms: float
    std_ttft_ms: float
    percentiles_ttft_ms: list[tuple[float, float]]
    mean_tpot_ms: float
    median_tpot_ms: float
    std_tpot_ms: float
    percentiles_tpot_ms: list[tuple[float, float]]
    mean_itl_ms: float
    median_itl_ms: float
    std_itl_ms: float
    percentiles_itl_ms: list[tuple[float, float]]
    # E2EL stands for end-to-end latency per request.
    # It is the time taken on the client side from sending
    # a request to receiving a complete response.
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
    percentiles_e2el_ms: list[tuple[float, float]]
async def get_request(
    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
) -> AsyncGenerator[SampleRequest, None]:
    """
    Asynchronously generates requests at a specified rate
    with OPTIONAL burstiness.

    Args:
        input_requests:
            A list of input requests, each represented as a SampleRequest.
        request_rate:
            The rate at which requests are generated (requests/s).
        burstiness (optional):
            The burstiness factor of the request generation.
            Only takes effect when request_rate is not inf.
            Default value is 1, which follows a Poisson process.
            Otherwise, the request intervals follow a gamma distribution.
            A lower burstiness value (0 < burstiness < 1) results
            in more bursty requests, while a higher burstiness value
            (burstiness > 1) results in a more uniform arrival of requests.
    """
    assert burstiness > 0, (
        f"A positive burstiness factor is expected, but given {burstiness}.")
    # Gamma scale parameter chosen so that the mean inter-arrival time
    # equals 1 / request_rate (for an infinite rate this is simply 0.0).
    theta = 1.0 / (request_rate * burstiness)
    send_immediately = request_rate == float("inf")

    for request in iter(input_requests):
        yield request

        if send_immediately:
            # No pacing needed when the rate is infinite.
            continue

        # Gamma-distributed interval; reduces to exponential (Poisson
        # arrivals) when burstiness == 1.
        pause = np.random.gamma(shape=burstiness, scale=theta)
        await asyncio.sleep(pause)
def calculate_metrics(
    input_requests: list[SampleRequest],
    outputs: list[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
    goodput_config_dict: dict[str, float],
) -> tuple[BenchmarkMetrics, list[int]]:
    """Aggregate per-request outputs into a BenchmarkMetrics summary.

    Returns the metrics plus the actual output length of every request
    (0 for failed requests). Latencies are collected in seconds and
    converted to milliseconds in the summary. Goodput counts requests that
    meet every SLO in goodput_config_dict (values given in ms).
    """
    actual_output_lens: list[int] = []
    total_input = 0
    completed = 0
    good_completed = 0
    itls: list[float] = []
    tpots: list[float] = []
    all_tpots: list[float] = []
    ttfts: list[float] = []
    e2els: list[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            output_len = outputs[i].output_tokens

            if not output_len:
                # We use the tokenizer to count the number of output tokens
                # for some serving backends instead of looking at
                # len(outputs[i].itl) since multiple output tokens may be
                # bundled together
                # Note : this may inflate the output token count slightly
                output_len = len(
                    tokenizer(outputs[i].generated_text,
                              add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i].prompt_len
            tpot = 0
            if output_len > 1:
                # Time-per-output-token excludes the first token (TTFT).
                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
                tpot = latency_minus_ttft / (output_len - 1)
                tpots.append(tpot)
            # Note: if output_len <= 1, we regard tpot as 0 for goodput
            all_tpots.append(tpot)
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            e2els.append(outputs[i].latency)
            completed += 1
        else:
            actual_output_lens.append(0)

    if goodput_config_dict:
        # Collect, per configured SLO, the per-request series and the SLO
        # threshold converted from ms to seconds.
        valid_metrics = []
        slo_values = []

        if "ttft" in goodput_config_dict:
            valid_metrics.append(ttfts)
            slo_values.append(goodput_config_dict["ttft"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
        if "tpot" in goodput_config_dict:
            valid_metrics.append(all_tpots)
            slo_values.append(goodput_config_dict["tpot"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
        if "e2el" in goodput_config_dict:
            valid_metrics.append(e2els)
            slo_values.append(goodput_config_dict["e2el"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)

        # A request is "good" when every selected metric meets its SLO.
        for req_metric in zip(*valid_metrics):
            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
            if is_good_req:
                good_completed += 1

    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
            stacklevel=2)
    # `xs or 0` below substitutes a scalar 0 when a series is empty so the
    # numpy reductions don't fail (e.g. ttfts is empty when the backend
    # does not support streaming).
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
        request_goodput=good_completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) /
        dur_s,
        mean_ttft_ms=np.mean(ttfts or 0) *
        1000,  # ttfts is empty if streaming is not supported by backend
        std_ttft_ms=np.std(ttfts or 0) * 1000,
        median_ttft_ms=np.median(ttfts or 0) * 1000,
        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
                             for p in selected_percentiles],
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        std_tpot_ms=np.std(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
                             for p in selected_percentiles],
        mean_itl_ms=np.mean(itls or 0) * 1000,
        std_itl_ms=np.std(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                            for p in selected_percentiles],
        mean_e2el_ms=np.mean(e2els or 0) * 1000,
        std_e2el_ms=np.std(e2els or 0) * 1000,
        median_e2el_ms=np.median(e2els or 0) * 1000,
        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                             for p in selected_percentiles],
    )

    return metrics, actual_output_lens
async
def
benchmark
(
backend
:
str
,
api_url
:
str
,
base_url
:
str
,
model_id
:
str
,
model_name
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
input_requests
:
list
[
SampleRequest
],
logprobs
:
Optional
[
int
],
request_rate
:
float
,
burstiness
:
float
,
disable_tqdm
:
bool
,
profile
:
bool
,
selected_percentile_metrics
:
list
[
str
],
selected_percentiles
:
list
[
float
],
ignore_eos
:
bool
,
goodput_config_dict
:
dict
[
str
,
float
],
max_concurrency
:
Optional
[
int
],
lora_modules
:
Optional
[
Iterable
[
str
]],
extra_body
:
Optional
[
dict
],
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
else
:
raise
ValueError
(
f
"Unknown backend:
{
backend
}
"
)
print
(
"Starting initial single prompt test run..."
)
test_prompt
,
test_prompt_len
,
test_output_len
,
test_mm_content
=
\
input_requests
[
0
].
prompt
,
input_requests
[
0
].
prompt_len
,
\
input_requests
[
0
].
expected_output_len
,
\
input_requests
[
0
].
multi_modal_data
if
backend
!=
"openai-chat"
and
test_mm_content
is
not
None
:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise
ValueError
(
"Multi-modal content is only supported on 'openai-chat' backend."
)
assert
test_mm_content
is
None
or
isinstance
(
test_mm_content
,
dict
)
test_input
=
RequestFuncInput
(
model
=
model_id
,
model_name
=
model_name
,
prompt
=
test_prompt
,
api_url
=
api_url
,
prompt_len
=
test_prompt_len
,
output_len
=
test_output_len
,
logprobs
=
logprobs
,
multi_modal_content
=
test_mm_content
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
,
)
test_output
=
await
request_func
(
request_func_input
=
test_input
)
if
not
test_output
.
success
:
raise
ValueError
(
"Initial test run failed - Please make sure benchmark arguments "
f
"are correctly specified. Error:
{
test_output
.
error
}
"
)
else
:
print
(
"Initial test run completed. Starting main benchmark run..."
)
if
lora_modules
:
# For each input request, choose a LoRA module at random.
lora_modules
=
iter
(
[
random
.
choice
(
lora_modules
)
\
for
_
in
range
(
len
(
input_requests
))])
if
profile
:
print
(
"Starting profiler..."
)
profile_input
=
RequestFuncInput
(
model
=
model_id
,
model_name
=
model_name
,
prompt
=
test_prompt
,
api_url
=
base_url
+
"/start_profile"
,
prompt_len
=
test_prompt_len
,
output_len
=
test_output_len
,
logprobs
=
logprobs
,
multi_modal_content
=
test_mm_content
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
)
profile_output
=
await
request_func
(
request_func_input
=
profile_input
)
if
profile_output
.
success
:
print
(
"Profiler started"
)
if
burstiness
==
1.0
:
distribution
=
"Poisson process"
else
:
distribution
=
"Gamma distribution"
print
(
f
"Traffic request rate:
{
request_rate
}
"
)
print
(
f
"Burstiness factor:
{
burstiness
}
(
{
distribution
}
)"
)
print
(
f
"Maximum request concurrency:
{
max_concurrency
}
"
)
pbar
=
None
if
disable_tqdm
else
tqdm
(
total
=
len
(
input_requests
))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore
=
(
asyncio
.
Semaphore
(
max_concurrency
)
if
max_concurrency
else
None
)
async
def
limited_request_func
(
request_func_input
,
pbar
):
if
semaphore
is
None
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
async
with
semaphore
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
list
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
,
burstiness
):
prompt
,
prompt_len
,
output_len
,
mm_content
=
request
.
prompt
,
\
request
.
prompt_len
,
request
.
expected_output_len
,
\
request
.
multi_modal_data
req_model_id
,
req_model_name
=
model_id
,
model_name
if
lora_modules
:
req_lora_module
=
next
(
lora_modules
)
req_model_id
,
req_model_name
=
req_lora_module
,
req_lora_module
request_func_input
=
RequestFuncInput
(
model
=
req_model_id
,
model_name
=
req_model_name
,
prompt
=
prompt
,
api_url
=
api_url
,
prompt_len
=
prompt_len
,
output_len
=
output_len
,
logprobs
=
logprobs
,
multi_modal_content
=
mm_content
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
)
tasks
.
append
(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
outputs
:
list
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
if
profile
:
print
(
"Stopping profiler..."
)
profile_input
=
RequestFuncInput
(
model
=
model_id
,
prompt
=
test_prompt
,
api_url
=
base_url
+
"/stop_profile"
,
prompt_len
=
test_prompt_len
,
output_len
=
test_output_len
,
logprobs
=
logprobs
,
)
profile_output
=
await
request_func
(
request_func_input
=
profile_input
)
if
profile_output
.
success
:
print
(
"Profiler stopped"
)
if
pbar
is
not
None
:
pbar
.
close
()
benchmark_duration
=
time
.
perf_counter
()
-
benchmark_start_time
metrics
,
actual_output_lens
=
calculate_metrics
(
input_requests
=
input_requests
,
outputs
=
outputs
,
dur_s
=
benchmark_duration
,
tokenizer
=
tokenizer
,
selected_percentile_metrics
=
selected_percentile_metrics
,
selected_percentiles
=
selected_percentiles
,
goodput_config_dict
=
goodput_config_dict
,
)
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
metrics
.
request_throughput
))
if
goodput_config_dict
:
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request goodput (req/s):"
,
metrics
.
request_goodput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
metrics
.
total_token_throughput
))
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
"total_input_tokens"
:
metrics
.
total_input
,
"total_output_tokens"
:
metrics
.
total_output
,
"request_throughput"
:
metrics
.
request_throughput
,
"request_goodput:"
:
metrics
.
request_goodput
if
goodput_config_dict
else
None
,
"output_throughput"
:
metrics
.
output_throughput
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"output_lens"
:
actual_output_lens
,
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
"itls"
:
[
output
.
itl
for
output
in
outputs
],
"generated_texts"
:
[
output
.
generated_text
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
def
process_one_metric
(
# E.g., "ttft"
metric_attribute_name
:
str
,
# E.g., "TTFT"
metric_name
:
str
,
# E.g., "Time to First Token"
metric_header
:
str
,
):
# This function prints and adds statistics of the specified
# metric.
if
metric_attribute_name
not
in
selected_percentile_metrics
:
return
print
(
"{s:{c}^{n}}"
.
format
(
s
=
metric_header
,
n
=
50
,
c
=
'-'
))
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"Mean
{
metric_name
}
(ms):"
,
getattr
(
metrics
,
f
"mean_
{
metric_attribute_name
}
_ms"
)))
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"Median
{
metric_name
}
(ms):"
,
getattr
(
metrics
,
f
"median_
{
metric_attribute_name
}
_ms"
)))
result
[
f
"mean_
{
metric_attribute_name
}
_ms"
]
=
getattr
(
metrics
,
f
"mean_
{
metric_attribute_name
}
_ms"
)
result
[
f
"median_
{
metric_attribute_name
}
_ms"
]
=
getattr
(
metrics
,
f
"median_
{
metric_attribute_name
}
_ms"
)
result
[
f
"std_
{
metric_attribute_name
}
_ms"
]
=
getattr
(
metrics
,
f
"std_
{
metric_attribute_name
}
_ms"
)
for
p
,
value
in
getattr
(
metrics
,
f
"percentiles_
{
metric_attribute_name
}
_ms"
):
p_word
=
str
(
int
(
p
))
if
int
(
p
)
==
p
else
str
(
p
)
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"P
{
p_word
}
{
metric_name
}
(ms):"
,
value
))
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
process_one_metric
(
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
process_one_metric
(
"e2el"
,
"E2EL"
,
"End-to-end Latency"
)
print
(
"="
*
50
)
return
result
def check_goodput_args(args):
    """Validate the parsed --goodput SLOs and return them as a dict.

    Returns an empty dict when --goodput was not supplied. Raises
    ValueError for an unknown metric name or a negative SLO value.
    """
    valid_names = ["ttft", "tpot", "e2el"]
    if not args.goodput:
        return {}
    goodput_config_dict = parse_goodput(args.goodput)
    for slo_name, slo_val in goodput_config_dict.items():
        # Only request-level latency metrics may carry an SLO.
        if slo_name not in valid_names:
            raise ValueError(
                f"Invalid metric name found, {slo_name}: {slo_val}. "
                "The service level objective name should be one of "
                f"{str(valid_names)}. ")
        # SLO thresholds are durations in milliseconds; negatives are invalid.
        if slo_val < 0:
            raise ValueError(
                f"Invalid value found, {slo_name}: {slo_val}. "
                "The service level objective value should be "
                "non-negative.")
    return goodput_config_dict
def parse_goodput(slo_pairs):
    """Parse "KEY:VALUE" strings into a {metric_name: milliseconds} dict.

    Raises argparse.ArgumentTypeError when a pair is malformed or its
    value is not a number.
    """
    try:
        return {
            name: float(value)
            for name, value in (pair.split(":") for pair in slo_pairs)
        }
    except ValueError as err:
        # Both a bad split (wrong arity) and a bad float raise ValueError.
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                     results: dict[str, Any],
                                     file_name: str) -> None:
    """Export benchmark results next to *file_name* in PyTorch OSS format.

    Only the summary latency statistics become metric records; the rest of
    *results* (minus the bulky per-request lists) rides along as extra
    info. Writes nothing when the converter yields no records.
    """
    summary_keys = [
        "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
        "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
    ]
    # Raw per-request lists are large; they can be re-added later if needed.
    bulky_keys = ["ttfts", "itls", "generated_texts", "errors"]
    excluded = set(summary_keys) | set(bulky_keys)
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={key: [results[key]] for key in summary_keys},
        extra_info={
            key: value
            for key, value in results.items() if key not in excluded
        })
    if not pt_records:
        return
    # Avoid a bare .json suffix so CI result collection does not pick it up.
    root = os.path.splitext(file_name)[0]
    write_to_json(f"{root}.pytorch.json", pt_records)
def main(args: argparse.Namespace) -> None:
    """Entry point: build the workload, run the async benchmark, save results.

    Steps: seed RNGs, resolve server URLs, load the tokenizer, sample
    requests from the selected dataset, run `benchmark(...)` under
    asyncio, and optionally dump a JSON result file.
    """
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    backend = args.backend
    model_id = args.model
    model_name = args.served_model_name
    # Fall back to the model path when no explicit tokenizer is given.
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
    tokenizer_mode = args.tokenizer_mode

    # Either an explicit base URL or host:port assembles the endpoints.
    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
        base_url = f"{args.base_url}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

    tokenizer = get_tokenizer(tokenizer_id,
                              tokenizer_mode=tokenizer_mode,
                              trust_remote_code=args.trust_remote_code)

    if args.dataset_name is None:
        raise ValueError(
            "Please specify '--dataset-name' and the corresponding "
            "'--dataset-path' if required.")

    if args.dataset_name == "sonnet":
        dataset = SonnetDataset(dataset_path=args.dataset_path)
        # For the "sonnet" dataset, formatting depends on the backend.
        if args.backend == "openai-chat":
            # Chat backend applies its own template; keep prompts raw.
            input_requests = dataset.sample(num_requests=args.num_prompts,
                                            input_len=args.sonnet_input_len,
                                            output_len=args.sonnet_output_len,
                                            prefix_len=args.sonnet_prefix_len,
                                            tokenizer=tokenizer,
                                            return_prompt_formatted=False)
        else:
            assert tokenizer.chat_template or tokenizer.default_chat_template, (
                "Tokenizer/model must have chat template for sonnet dataset.")
            input_requests = dataset.sample(num_requests=args.num_prompts,
                                            input_len=args.sonnet_input_len,
                                            output_len=args.sonnet_output_len,
                                            prefix_len=args.sonnet_prefix_len,
                                            tokenizer=tokenizer,
                                            return_prompt_formatted=True)
    elif args.dataset_name == "hf":
        # all following datasets are implemented from the
        # HuggingFaceDataset base class
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = VisionArenaDataset
            args.hf_split = "train"
            args.hf_subset = None
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = InstructCoderDataset
            args.hf_split = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = ConversationDataset
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_class = AIMODataset
            args.hf_split = "train"
        else:
            supported_datasets = set([
                dataset_name for cls in HuggingFaceDataset.__subclasses__()
                for dataset_name in cls.SUPPORTED_DATASET_PATHS
            ])
            raise ValueError(
                f"Unsupported dataset path: {args.dataset_path}. "
                "Huggingface dataset only supports dataset_path"
                f" from one of following: {supported_datasets}. "
                "Please consider contributing if you would "
                "like to add support for additional dataset formats.")
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            output_len=args.hf_output_len,
        )
    else:
        # For datasets that follow a similar structure, use a mapping.
        dataset_mapping = {
            "sharegpt":
            lambda: ShareGPTDataset(random_seed=args.seed,
                                    dataset_path=args.dataset_path).sample(
                                        tokenizer=tokenizer,
                                        num_requests=args.num_prompts,
                                        output_len=args.sharegpt_output_len,
                                    ),
            "burstgpt":
            lambda: BurstGPTDataset(random_seed=args.seed,
                                    dataset_path=args.dataset_path).sample(
                                        tokenizer=tokenizer,
                                        num_requests=args.num_prompts),
            "random":
            lambda: RandomDataset(dataset_path=args.dataset_path).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
                input_len=args.random_input_len,
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
            )
        }

        try:
            input_requests = dataset_mapping[args.dataset_name]()
        except KeyError as err:
            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err

    goodput_config_dict = check_goodput_args(args)

    # Collect the sampling parameters.
    sampling_params = {
        k: v
        for k, v in {
            "top_p": args.top_p,
            "top_k": args.top_k,
            "min_p": args.min_p,
            "temperature": args.temperature
        }.items() if v is not None
    }

    # Sampling parameters are only supported by openai-compatible backend.
    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
        raise ValueError(
            "Sampling parameters are only supported by openai-compatible "
            "backends.")

    if "temperature" not in sampling_params:
        sampling_params["temperature"] = 0.0  # Default to greedy decoding.

    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()

    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
            api_url=api_url,
            base_url=base_url,
            model_id=model_id,
            model_name=model_name,
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
            request_rate=args.request_rate,
            burstiness=args.burstiness,
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            selected_percentile_metrics=args.percentile_metrics.split(","),
            selected_percentiles=[
                float(p) for p in args.metric_percentiles.split(",")
            ],
            ignore_eos=args.ignore_eos,
            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
            extra_body=sampling_params,
        ))

    # Save config and results to json
    if args.save_result:
        result_json: dict[str, Any] = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["num_prompts"] = args.num_prompts

        # Metadata
        if args.metadata:
            for item in args.metadata:
                if "=" in item:
                    kvstring = item.split("=")
                    result_json[kvstring[0].strip()] = kvstring[1].strip()
                else:
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )

        # NOTE(review): these fields come from benchmark_result, which is
        # merged in below — at this point result_json cannot contain them,
        # so this loop looks like a no-op; confirm intended ordering.
        if not args.save_detailed:
            # Remove fields with too many data points
            for field in [
                    "input_lens", "output_lens", "ttfts", "itls",
                    "generated_texts", "errors"
            ]:
                if field in result_json:
                    del result_json[field]

        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
                               if args.max_concurrency is not None else "")
        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  #noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w", encoding='utf-8') as outfile:
            json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
    # CLI definition: parse arguments and hand off to main().
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default=None,
        help="Server or API base url if not using http host and port.",
    )
    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
    parser.add_argument("--host", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--endpoint",
        type=str,
        default="/v1/completions",
        help="API endpoint.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="sharegpt",
        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
                        help="Path to the sharegpt/sonnet dataset. "
                        "Or the huggingface dataset ID if using HF dataset.")
    parser.add_argument(
        "--max-concurrency",
        type=int,
        default=None,
        help="Maximum number of concurrent requests. This can be used "
        "to help simulate an environment where a higher level component "
        "is enforcing a maximum number of concurrent requests. While the "
        "--request-rate argument controls the rate at which requests are "
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
        "if the server is not processing requests fast enough to keep up.")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Name of the model.",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--logprobs",
        type=int,
        default=None,
        help=("Number of logprobs-per-token to compute & return as part of "
              "the request. If unspecified, then either (1) if beam search "
              "is disabled, no logprobs are computed & a single dummy "
              "logprob is returned for each token; or (2) if beam search "
              "is enabled 1 logprob per token is computed"),
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process or gamma distribution "
        "to synthesize the request arrival times.",
    )
    parser.add_argument(
        "--burstiness",
        type=float,
        default=1.0,
        help="Burstiness factor of the request generation. "
        "Only take effect when request_rate is not inf. "
        "Default value is 1, which follows Poisson process. "
        "Otherwise, the request intervals follow a gamma distribution. "
        "A lower burstiness value (0 < burstiness < 1) results in more "
        "bursty requests. A higher burstiness value (burstiness > 1) "
        "results in a more uniform arrival of requests.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Use Torch Profiler. The endpoint must be launched with "
        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
    )
    parser.add_argument(
        "--save-result",
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
    parser.add_argument(
        "--save-detailed",
        action="store_true",
        help="When saving the results, whether to include per request "
        "information such as response, error, ttfs, tpots, etc.",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
        nargs="*",
        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
        "for metadata of this run to be saved in the result JSON file "
        "for record keeping purposes.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=None,
        help="Specify directory to save benchmark json results."
        "If not specified, results are saved in the current directory.",
    )
    parser.add_argument(
        "--result-filename",
        type=str,
        default=None,
        help="Specify the filename to save benchmark json results."
        "If not specified, results will be saved in "
        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
        " format.",
    )
    parser.add_argument(
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request."
        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )
    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
        "pairs, where the key is a metric name, and the value is in "
        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
        "separated by spaces. Allowed request level metric names are "
        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")

    # group for dataset specific arguments
    sonnet_group = parser.add_argument_group("sonnet dataset options")
    sonnet_group.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help="Number of input tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help="Number of output tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help="Number of prefix tokens per request, used only for sonnet dataset.",
    )

    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
    sharegpt_group.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.")

    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help="Number of input tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
        default=0.0,
        help="Range ratio for sampling input/output length, "
        "used only for random sampling. Must be in the range [0, 1) to define "
        "a symmetric sampling range"
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help=("Number of fixed prefix tokens before the random context "
              "in a request. "
              "The total input length is the sum of `random-prefix-len` and "
              "a random "
              "context length sampled from [input_len * (1 - range_ratio), "
              "input_len * (1 + range_ratio)]."),
    )

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
                          type=str,
                          default=None,
                          help="Subset of the HF dataset.")
    hf_group.add_argument("--hf-split",
                          type=str,
                          default=None,
                          help="Split of the HF dataset.")
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output lengths "
        "from the sampled HF dataset.",
    )

    # Sampling knobs forwarded verbatim to openai-compatible backends.
    sampling_group = parser.add_argument_group("sampling parameters")
    sampling_group.add_argument(
        "--top-p",
        type=float,
        default=None,
        help="Top-p sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--top-k",
        type=int,
        default=None,
        help="Top-k sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--min-p",
        type=float,
        default=None,
        help="Min-p sampling parameter. Only has effect on openai-compatible "
        "backends.")
    sampling_group.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Temperature sampling parameter. Only has effect on "
        "openai-compatible backends. If not specified, default to greedy "
        "decoding (i.e. temperature==0.0).")

    parser.add_argument(
        '--tokenizer-mode',
        type=str,
        default="auto",
        choices=['auto', 'slow', 'mistral', 'custom'],
        help='The tokenizer mode.\n\n* "auto" will use the '
        'fast tokenizer if available.\n* "slow" will '
        'always use the slow tokenizer.\n* '
        '"mistral" will always use the `mistral_common` tokenizer.\n*'
        '"custom" will use --tokenizer to select the preregistered tokenizer.')

    parser.add_argument("--served-model-name",
                        type=str,
                        default=None,
                        help="The model name used in the API. "
                        "If not specified, the model name will be the "
                        "same as the ``--model`` argument. ")

    parser.add_argument("--lora-modules",
                        nargs='+',
                        default=None,
                        help="A subset of LoRA module names passed in when "
                        "launching the server. For each request, the "
                        "script chooses a LoRA module at random.")

    args = parser.parse_args()
    main(args)
3_env_check-batch_onlinetests/scripts/benchmark_utils.py
0 → 100644
View file @
f6a338d7
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
json
import
math
import
os
from
typing
import
Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark,
    with one metric per record.
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    # NOTE(review): environment values are strings, so any non-empty value
    # (even "0") enables export — confirm this gating is intended.
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return []

    records = []
    for metric_name, benchmark_values in metrics.items():
        # vars(args) is the Namespace's own dict; the backfill below is
        # therefore visible to args and to every record built here.
        arg_dict = vars(args)
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": arg_dict,
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": metric_name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        # Save tensor_parallel_size parameter if it's part of the metadata
        if (not arg_dict.get("tensor_parallel_size")
                and "tensor_parallel_size" in extra_info):
            arg_dict["tensor_parallel_size"] = extra_info[
                "tensor_parallel_size"]

        records.append(record)

    return records
class InfEncoder(json.JSONEncoder):
    """JSON encoder that emits infinite floats as the string "inf".

    Standard JSON has no representation for infinity; this keeps the
    output parseable by strict decoders.
    """

    def clear_inf(self, o: Any):
        # Recursively sanitize containers; replace any infinite float
        # (positive or negative) with the string "inf".
        if isinstance(o, float):
            return "inf" if math.isinf(o) else o
        if isinstance(o, dict):
            return {key: self.clear_inf(value) for key, value in o.items()}
        if isinstance(o, list):
            return [self.clear_inf(item) for item in o]
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        # Sanitize the whole tree up front, then defer to the base encoder.
        return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
    """Serialize *records* to *filename* as JSON.

    Uses InfEncoder so infinite float values are written as the string
    "inf" instead of producing non-standard JSON tokens.
    """
    # Pin UTF-8 explicitly: without it, open() uses the platform/locale
    # default encoding, which can fail on non-ASCII content (e.g. cp1252).
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(records, f, cls=InfEncoder)
3_env_check-batch_onlinetests/scripts/entrypoint.sh
0 → 100644
View file @
f6a338d7
#!/bin/bash
# Container entrypoint: run the environment check, then the benchmark suite.
# Run environment check (currently disabled — see commented command below).
echo "==================== 开始系统环境检查 ===================="
#/workspace/scripts/run_envcheck.sh
# Run the performance benchmark.
echo "==================== 开始性能测试 ===================="
/workspace/scripts/run_benchmark.sh
echo "==================== 所有测试完成 ===================="
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/run_benchmark.sh
0 → 100644
View file @
f6a338d7
#!/bin/bash
# Batch online-inference benchmark driver.
# For each non-comment, non-empty line of ../configs/model_to_test.cfg it:
#   1. generates a per-model vLLM server launch script,
#   2. starts that server in the background, logging to a per-model file,
#   3. polls the log until the API-server banner appears (or error/timeout),
#   4. runs ./test.sh against the server, then tears the server down.

# Initialize output directories
mkdir -p /workspace/test/inference_outputs/results
mkdir -p /workspace/test/inference_outputs/logs/server
mkdir -p /workspace/test/inference_outputs/logs/models

# Base port; each model in the config gets the next port in sequence
BASE_PORT=8001

# Read the semicolon-separated config file
# (input is supplied by the process substitution at the matching `done`)
while IFS=';' read -r model_name model_path tp data_type batch_list prompt_pairs max_model_len gpu_mem_util; do
    # Clean each field (strip surrounding whitespace and quote characters)
    model_name=$(echo "$model_name" | xargs)
    model_path=$(echo "$model_path" | xargs)
    tp=$(echo "$tp" | xargs)
    data_type=$(echo "$data_type" | xargs)
    batch_list=$(echo "$batch_list" | tr -d '"' | xargs)
    prompt_pairs=$(echo "$prompt_pairs" | tr -d '()"' | xargs)
    max_model_len=$(echo "$max_model_len" | xargs)
    gpu_mem_util=$(echo "$gpu_mem_util" | xargs)

    # Dynamically allocate a port for this model's server
    port=$((BASE_PORT++))

    # Generate server.sh. The heredoc delimiter is UNQUOTED, so $variables
    # expand now; the escaped \\ become literal line continuations in the
    # generated script.
    cat > "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh" << EOF
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
vllm serve "$model_path" --trust-remote-code \\
    --enable-prefix-caching \\
    --dtype $data_type \\
    --tensor-parallel-size $tp \\
    --max-model-len $max_model_len \\
    --port $port \\
    --gpu-memory-utilization $gpu_mem_util
EOF

    # Make the generated script executable
    chmod +x "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"
    echo "Generated server script for ${model_name}_tp${tp} at /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"

    # 1. Launch the vLLM server in the background, logging to server.log
    /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh > "/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log" 2>&1 &
    SERVER_PID=$!

    # 2. Log-based health probe. Return codes:
    #    0 = success banner found, 1 = fatal error or dead process,
    #    2 = no verdict yet, keep waiting.
    check_server_status() {
        local log_file=$1
        local server_pid=$2
        local success_msg="Starting vLLM API server on http://0.0.0.0"
        local error_patterns=("RuntimeError" "ValueError" "segmentation fault" "core dumped")

        # Check for the success banner
        if grep -q "$success_msg" "$log_file"; then
            echo "✅ Server started successfully!"
            return 0
        fi

        # Check for known fatal error patterns
        for pattern in "${error_patterns[@]}"; do
            if grep -i -q "$pattern" "$log_file"; then
                echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Error detected in logs (pattern: $pattern)!"
                echo "===== ERROR CONTEXT ====="
                grep -i -A 5 -B 5 "$pattern" "$log_file" | tail -n 20
                echo "========================="
                return 1
            fi
        done

        # Check whether the server process is still alive
        if ! kill -0 $server_pid 2>/dev/null; then
            echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Server process died unexpectedly!"
            echo "===== LAST LOG LINES ====="
            tail -n 20 "$log_file"
            echo "========================="
            return 1
        fi

        # Default: keep waiting
        return 2
    }

    # 3. Wait for the server to start (or fail / time out)
    echo -e "\n🔍 [$(date '+%Y-%m-%d %H:%M:%S')] Starting monitoring for ${model_name}_tp${tp} (PID: $SERVER_PID)"
    max_wait_seconds=20000
    start_time=$(date +%s)
    log_file="/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log"

    while true; do
        sleep 20  # poll every 20 seconds
        check_server_status "$log_file" "$SERVER_PID"
        status=$?

        # Success
        if [ $status -eq 0 ]; then
            break
        fi

        # Failure
        if [ $status -eq 1 ]; then
            # Clean up this model's server before moving on
            kill $SERVER_PID 2>/dev/null
            pkill -f "vllm serve" 2>/dev/null
            echo "🛑 Cleaned up resources after failure"
            # `continue 2` resumes the OUTER while-read loop: next model
            continue 2
        fi

        # Timeout check
        current_time=$(date +%s)
        elapsed=$((current_time - start_time))
        if [ $elapsed -ge $max_wait_seconds ]; then
            echo -e "\n⏰ [$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for server to start!"
            # Clean up this model's server before moving on
            kill $SERVER_PID 2>/dev/null
            pkill -f "vllm serve" 2>/dev/null
            echo "🛑 Cleaned up resources after timeout"
            # Skip straight to the next model
            continue 2
        fi
        echo "Waiting... (${elapsed}s elapsed)"
    done

    # 4. Server is up — run the benchmark for this model
    echo -e "\n🚀 [$(date '+%Y-%m-%d %H:%M:%S')] Running tests for ${model_name}_tp${tp}..."

    # Parameters are handed to test.sh via environment variables
    export MODEL_NAME="$model_name"
    export MODEL_PATH="$model_path"
    export TP="$tp"
    export DATA_TYPE="$data_type"
    export BATCH_LIST="$batch_list"
    export PROMPT_PAIRS="$prompt_pairs"
    export PORT="$port"

    # Run the test.
    # NOTE(review): relative invocation — assumes the current working
    # directory is the scripts directory (same assumption as the
    # ../configs path below); confirm against the Dockerfile WORKDIR.
    ./test.sh

    # 5. Tear down after the test
    kill $SERVER_PID
    pkill -f "vllm serve" 2>/dev/null
    echo "✅ [$(date '+%Y-%m-%d %H:%M:%S')] ${model_name}_tp${tp} test completed and cleaned up"
done < <(grep -v '^#' ../configs/model_to_test.cfg | grep -v '^$')

echo -e "\n📊 [$(date '+%Y-%m-%d %H:%M:%S')] All tests completed. Results saved to results/"
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/run_envcheck.sh
0 → 100644
View file @
f6a338d7
#!/bin/bash
set
-eo
pipefail
# 严格错误处理
log_dir
=
"/workspace/test/env_check_outputs/"
mkdir
-p
"
$log_dir
"
echo
"==================== 开始系统环境检查 ===================="
# Robust check wrapper: announces the check, runs the given command with its
# combined output teed to a per-check log, and reports failure via the return
# status without ever aborting the whole script.
run_test() {
    local name=$1
    local chinese_name=$2
    shift 2
    local logfile="$log_dir/${chinese_name}.log"
    echo "[RUN] $chinese_name"
    if "$@" 2>&1 | tee "$logfile"; then
        return 0
    fi
    # Non-zero status tells the caller the check failed; the script goes on.
    echo "[WARN] $chinese_name 检查失败" | tee -a "$logfile"
    return 1
}
# Like run_test, but the check is a full shell pipeline passed as a single
# string and executed via `bash -c` so pipes/quoting inside it are honored.
run_pipe_test() {
    local name=$1
    local chinese_name=$2
    local cmd=$3
    local logfile="$log_dir/${chinese_name}.log"
    echo "[RUN] $chinese_name"
    if bash -c "$cmd" 2>&1 | tee "$logfile"; then
        return 0
    fi
    # Signal failure to the caller without exiting the script.
    echo "[WARN] $chinese_name 检查失败" | tee -a "$logfile"
    return 1
}
# Section runner: prints a section banner, then evaluates each command string
# in turn. A failing command is logged to error.log and the remaining
# commands still execute, so one broken check never stops the sweep.
safe_run() {
    local section=$1
    shift
    echo "==================== $section ===================="
    local cmd
    for cmd in "$@"; do
        # eval is required so quoting inside each command string is honored
        eval "$cmd" || echo "[WARN] 命令执行失败: $cmd" | tee -a "$log_dir/error.log"
    done
}
# ------------------------- 1. 系统基础检查 -------------------------
safe_run
"1.系统基础信息检查"
\
'run_test uname "01_系统内核信息" uname -a'
\
'run_test os_release "02_操作系统版本" cat /etc/os-release'
\
'run_test locale "03_系统语言环境" locale'
# ------------------------- 2. CPU & 内存检查 -------------------------
safe_run
"2.CPU_内存检查"
\
'run_test cpu_info "04_CPU详细信息" lscpu'
\
'run_test cpu_cores "05_CPU核心数" nproc'
\
'run_pipe_test cpu_freq "06_CPU频率" "cat /proc/cpuinfo | grep \"MHz\" | sort -u"'
\
'run_test memory_usage "07_内存使用情况" free -h'
\
'run_test vm_stat "08_系统整体CPU和内存使用情况" vmstat 1 10'
\
'run_test numa_nodes "09_NUMA节点信息" numactl --hardware || true'
\
'run_pipe_test cpu_usage "10_CPU利用率检查" "mpstat -P ALL 1 5"'
\
'run_pipe_test cpu_top_usage "11_CPU占用最高进程检查" "ps -eo pid,%cpu,cmd --sort=-%cpu | head -n 10"'
# ------------------------- 3. 存储设备检查 -------------------------
safe_run
"3.存储设备检查"
\
'run_test disk_usage "12_磁盘使用情况" df -hT'
\
'run_test mount_info "13_挂载信息" mount | column -t'
\
'run_test block_devices "14_块设备信息" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT'
# ------------------------- 4. 网络检查 -------------------------
safe_run
"4.网络检查"
\
'run_test netstat "15_网络连接状态" ss -tulnp'
\
'run_test network_interfaces "16_网络接口信息" ip -br a'
\
'run_test routing_table "17_路由表信息" ip route'
\
'run_test arp_table "18_ARP表信息" ip neigh'
\
'run_test ibdev2netdev "19_InfiniBand设备映射" ibdev2netdev'
\
'run_test topo "20_网卡-dcu-topo" lspci -vt '
# ------------------------- 5. DCU&内核&驱动检查 -------------------------
safe_run
"5.DCU_内核_驱动检查"
\
'run_test hy_smi "21_DCU设备状态" hy-smi'
\
'run_test clock_level "22_DCU时钟级别" hy-smi -g'
\
'run_test driverversion "23_DCU驱动版本" hy-smi --showdriverversion'
\
'run_test rocminfo "24_ROCM信息" rocminfo'
\
'run_test kernel_modules "25_已加载内核模块" lsmod'
\
'run_test kernel_version "26_内核版本" uname -r'
# ------------------------- 6. 软件栈检查 -------------------------
safe_run
"6.软件栈检查"
\
'run_test pip_list "27_Python包列表" pip list'
\
'run_test glibc_version "28_GLIBC版本" ldd --version'
# ------------------------- 7. 其他硬件状态检查 -------------------------
safe_run
"7.其他硬件状态检查"
\
'run_test lspci "29_PCI设备列表" lspci'
\
'run_test iostat "30_IO统计信息" iostat'
\
'run_test hardware_info "31_硬件摘要信息" lshw -short || true'
\
'run_pipe_test ACS_stat "32_ACS状态检查" "lspci -vvv | grep -i acsct"'
\
'run_test dmesg "33_内核日志" dmesg'
\
'run_pipe_test pcie_topology "34_PCIe拓扑结构" "echo \"====== PCIe 桥接器 ======\"; lspci -vvv | grep -E \"PCI bridge|Root port\" -A 20 | grep -E \"Device|Vendor|LnkSta:|LnkCap:|Secondary bus\"; echo \"\"; echo \"====== PCIe 带宽汇总 ======\"; lspci -vvv | grep \"LnkSta:\" | sort | uniq -c"'
\
'run_pipe_test storage_details "35_存储控制器详情" "echo \"====== 存储控制器 ======\"; lspci -vvv | grep -E \"NVMe|SATA|RAID|Storage controller\" -A 30 | grep -E \"Device|Vendor|Kernel driver|LnkSta:|Speed|Width|MSI|Bar Memory\""'
\
'run_pipe_test nic_details "36_网卡详细信息" "echo \"====== 网卡详细信息 ======\"; lspci -vvv | grep -E \"Ethernet controller|Network controller|InfiniBand\" -A 50 | grep -E \"Device|Vendor|Subsystem|Kernel driver|Kernel modules|LnkSta:|LnkCap:|NUMA node|Speed|Width\""'
\
'run_pipe_test iommu_stat "37_IOMMU状态" "dmesg | grep IOMMU"'
\
'run_pipe_test SELinux_stat "38_SELinux状态" "dmesg | grep SELinux"'
# ------------------------- 8. 带宽检查 -------------------------
source
/opt/dtk/env.sh
safe_run
"8.带宽检查"
\
'run_test D2D-a_test "39_D2D单向带宽测试" /opt/dtk/bin/BandwidthTest -a -s 512MB '
\
'run_test D2D-A_test "40_D2D双向带宽测试" /opt/dtk/bin/BandwidthTest -A -s 512MB '
\
'run_test D2H-H2D_test "41_D2H和H2D带宽测试" /opt/dtk/bin/BandwidthTest -t 3 '
\
'cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }'
\
'if [ -f "rccl-tests.zip" ]; then
echo "[INFO] 发现 rccl-tests.zip,开始解压..."
unzip -o rccl-tests.zip -d rccl-tests || {
echo "[ERROR] rccl-tests.zip 解压失败" | tee "$log_dir/42_RCCL测试解压失败.log"
exit 1
}
cd rccl-tests/rccl-tests || { echo "[ERROR] 无法进入rccl-tests目录"; exit 1; }
if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/43_RCCL_all_reduce_8卡测试.log" || true
./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/44_RCCL_all_reduce_4卡测试.log" || true
else
echo "[ERROR] RCCL编译失败" | tee "$log_dir/45_RCCL编译失败.log"
fi
cd ../..
else
echo "[WARN] 未找到 rccl-tests.zip,跳过 RCCL 测试" | tee "$log_dir/46_RCCL测试跳过.log"
fi'
# ------------------------- 9.DCU环境检查 -------------------------
safe_run
"9.DCU环境检查"
\
'cd /workspace/test/env_check_tools/ || { echo "[ERROR] 无法进入/workspace/test/env_check_tools/目录"; exit 1; }'
\
'if [ -f "dcu_env_check.zip" ]; then
echo "[INFO] 发现 dcu_env_check.zip,开始解压..."
unzip -o dcu_env_check.zip -d dcu_env_check || {
echo "[ERROR] dcu_env_check.zip 解压失败" | tee "$log_dir/47_DCU环境检查解压失败.log"
exit 1
}
chmod +x dcu_env_check/dcu_env_check-main/tools/*
cd dcu_env_check/dcu_env_check-main && {
bash system_check.sh 2>&1 | tee "$log_dir/48_DCU环境检查结果.log" || true
cp system_info* /workspace/test/env_check_outputs/ || true
cd ../..
} || {
echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/49_DCU环境检查执行失败.log"
}
else
echo "[WARN] 未找到 dcu_env_check.zip,跳过 DCU 环境检查" | tee "$log_dir/50_DCU环境检查跳过.log"
fi'
echo
"==================== 检查完成 ===================="
echo
"所有日志已保存至:
$log_dir
"
ls
-lh
"
$log_dir
"
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/test.sh
0 → 100644
View file @
f6a338d7
#!/bin/bash
# Per-model benchmark runner, invoked by run_benchmark.sh once the vLLM
# server is up. Sweeps every (batch, prompt/completion-length) combination
# and appends one CSV row of extracted metrics per run.

# DCU / RCCL / vLLM tuning environment (mirrors the generated server script)
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7

# Read the run parameters from environment variables (set by run_benchmark.sh)
model_name=${MODEL_NAME}
model_path=${MODEL_PATH}
tp=${TP}
data_type=${DATA_TYPE}
batch_list=${BATCH_LIST}
prompt_pairs=${PROMPT_PAIRS}
port=${PORT}

# Result CSV for this model/TP combination; header row first
result_file="/workspace/test/inference_outputs/results/${model_name}_tp${tp}.csv"
echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s),TTFT(ms),TPOT(ms),ITL(ms)" > "$result_file"

# Split the parameter strings into arrays:
# batches are space-separated; prompt/completion pairs are comma-separated
IFS=' ' read -ra batches <<< "$batch_list"
IFS=',' read -ra pairs <<< "$prompt_pairs"

# Run the sweep
for batch in "${batches[@]}"; do
    for pair in "${pairs[@]}"; do
        # Each pair is "prompt_tokens completion_tokens"
        IFS=' ' read -r prompt_tokens completion_tokens <<< "$pair"
        log_file="/workspace/test/inference_outputs/logs/models/${model_name}_${tp}/batch_${batch}_prompt_${prompt_tokens}_completion_${completion_tokens}.log"
        mkdir -p "$(dirname "$log_file")"
        echo "Running: batch=$batch, prompt=$prompt_tokens, completion=$completion_tokens"
        # NOTE(review): relative script path — assumes the CWD contains
        # benchmark_serving.py; confirm against the caller's working dir.
        python benchmark_serving.py \
            --backend openai \
            --port "$port" \
            --model "$model_path" \
            --trust-remote-code \
            --dataset-name random \
            --ignore-eos \
            --random-input-len "$prompt_tokens" \
            --random-output-len "$completion_tokens" \
            --num-prompts "$batch" \
            2>&1 | tee "$log_file"
        # Scrape metrics from the benchmark's console summary.
        # NOTE(review): field positions ($4/$5) are coupled to
        # benchmark_serving.py's exact output format — verify after upgrades.
        TOTAL_THROUGHPUT=$(grep "^Total Token" "$log_file" | awk '{print $5}')
        GEN_THROUGHPUT=$(grep "^Output token" "$log_file" | awk '{print $5}')
        TTFT=$(grep "^Mean TTFT" "$log_file" | awk '{print $4}')
        TPOT=$(grep "^Mean TPOT" "$log_file" | awk '{print $4}')
        ITL=$(grep "^Mean ITL" "$log_file" | awk '{print $4}')
        # Append one CSV row for this run
        echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT,$ITL" >> "$result_file"
    done
done
3_env_check-batch_onlinetests/start.sh
0 → 100644
View file @
f6a338d7
# Build the benchmark image and run it with full DCU device access.
# Host mounts expose HYHAL, the model store (read-only), and the two
# output directories that the container's test scripts write into.
docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v /public/opendas/DL_DATA/llm-models:/workspace/llms/:ro \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --network=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1 \
# NOTE(review): the original file ends with a dangling line-continuation
# backslash and no trailing newline — harmless at EOF, but tidy it up.
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment