jerrrrry / easystart_v0.2 · Commits

Commit f6a338d7, authored Jul 16, 2025 by jerrrrry

Initial commit
Showing 13 changed files with 905 additions and 0 deletions
3_env_check-batch_onlinetests/Dockerfile (+42, -0)
3_env_check-batch_onlinetests/configs/model_to_test.cfg (+26, -0)
3_env_check-batch_onlinetests/env_check_tools/dcu_env_check.zip (+0, -0)
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip (+0, -0)
3_env_check-batch_onlinetests/scripts/backend_request_func.py (+505, -0)
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py (+0, -0)
3_env_check-batch_onlinetests/scripts/benchmark_serving.py (+0, -0)
3_env_check-batch_onlinetests/scripts/benchmark_utils.py (+69, -0)
3_env_check-batch_onlinetests/scripts/entrypoint.sh (+12, -0)
3_env_check-batch_onlinetests/scripts/run_benchmark.sh (+166, -0)
3_env_check-batch_onlinetests/scripts/run_envcheck.sh (+0, -0)
3_env_check-batch_onlinetests/scripts/test.sh (+64, -0)
3_env_check-batch_onlinetests/start.sh (+21, -0)
3_env_check-batch_onlinetests/Dockerfile
0 → 100644
# Use the official SourceFind (光源) base image
FROM image.sourcefind.cn:5000/dcu/admin/base/vllm:0.8.5-ubuntu22.04-dtk25.04.1-rc5-das1.6-py3.10-20250711

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
        sysstat \
        locate \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/configs && \
    mkdir -p /workspace/test/env_check_outputs && \
    mkdir -p /workspace/test/inference_outputs && \
    mkdir -p /workspace/test/models && \
    mkdir -p /workspace/test/env_check_tools

# Copy scripts, configs, and environment-check tools
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
COPY ./env_check_tools/dcu_env_check.zip /workspace/test/env_check_tools/
COPY ./env_check_tools/rccl-tests.zip /workspace/test/env_check_tools/

RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*

# Set the working directory (recommended: the scripts directory itself)
WORKDIR /workspace/scripts/

# Run the entrypoint script directly (no cd needed)
CMD bash -c "bash entrypoint.sh"
\ No newline at end of file
3_env_check-batch_onlinetests/configs/model_to_test.cfg
0 → 100644
Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;4;float16;"1 ";(512 512);32768;0.95
Qwen2.5-VL-7B;/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/;1;float16;"1 ";(512 512);32768;0.95
#Qwen2.5-VL-32B;/workspace/llms/qwen2.5/Qwen2.5-VL-32B-Instruct/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms/qwen3/Qwen3-32B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-32B;/workspace/llms//qwen3/Qwen3-32B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-30B-A3B;/workspace/llms/qwen3/Qwen3-30B-A3B/;4;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-4B;/workspace/llms/qwen3/Qwen3-4B/;1;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024,20480 1024);32768;0.95
#Qwen3-235B-A22B;/workspace/llms/qwen3/Qwen3-235B-A22B/;8;float16;"1 4 8 12 16 20 24 28 32 ";(512 512,512 1024,1024 1024,2048 1024,3072 1024,4096 1024,8192 1024,10240 1024,16384 1024);20000;0.95
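Each line above is one semicolon-separated test record that run_benchmark.sh reads as model_name;model_path;tp;data_type;"batch list";(prompt pairs);max_model_len;gpu_mem_util. The sketch below shows the same decomposition in Python; the parse_cfg_line helper is hypothetical and only illustrates the field layout, it is not part of this repository.

# Minimal, hypothetical sketch of how a model_to_test.cfg line decomposes;
# field names follow run_benchmark.sh, the helper itself is not in the repo.
def parse_cfg_line(line: str) -> dict:
    fields = [f.strip() for f in line.split(";")]
    (model_name, model_path, tp, data_type,
     batch_list, prompt_pairs, max_model_len, gpu_mem_util) = fields
    return {
        "model_name": model_name,
        "model_path": model_path,
        "tp": int(tp),                       # tensor-parallel size
        "data_type": data_type,              # e.g. float16
        "batches": [int(b) for b in batch_list.strip('"').split()],
        # "(512 512,512 1024)" -> [(512, 512), (512, 1024)] as (input, output) token pairs
        "prompt_pairs": [tuple(map(int, p.split()))
                         for p in prompt_pairs.strip('()"').split(",")],
        "max_model_len": int(max_model_len),
        "gpu_memory_utilization": float(gpu_mem_util),
    }

print(parse_cfg_line('Qwen2.5-VL-7B;/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/;1;float16;"1 ";(512 512);32768;0.95'))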
3_env_check-batch_onlinetests/env_check_tools/dcu_env_check.zip
0 → 100644
File added
3_env_check-batch_onlinetests/env_check_tools/rccl-tests.zip
0 → 100644
File added
3_env_check-batch_onlinetests/scripts/backend_request_func.py
0 → 100644
# SPDX-License-Identifier: Apache-2.0
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import Optional, Union

import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    itl: list[float] = field(
        default_factory=list)  # list of inter-token latencies
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
            "truncate": request_func_input.prompt_len,
            "ignore_eos_token": request_func_input.ignore_eos,
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        if request_func_input.ignore_eos:
            output.output_tokens = request_func_input.output_len
        else:
            output.output_tokens = None

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = chunk_bytes.removeprefix("data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:

        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    if "choices" in parsed_resp:
                        output.generated_text = parsed_resp["choices"][0][
                            "text"]
                    elif "text" in parsed_resp:
                        output.generated_text = parsed_resp["text"][0]
                    else:
                        output.error = ("Unexpected response format: "
                                        "neither 'choices' nor 'text' found")
                        output.success = False
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    first_chunk_received = False
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if choices := data.get("choices"):
                                # Note that text could be empty here
                                # e.g. for special tokens
                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
                                    first_chunk_received = True
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
                        output.success = False
                        output.error = (
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("chat/completions", "profile")
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model_name \
                if request_func_input.model_name else request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": content
                },
            ],
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
            "stream_options": {
                "include_usage": True,
            },
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                # First token
                                if ttft == 0.0:
                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += content or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get(
                                    "completion_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

        from vllm.model_executor.model_loader.weight_utils import get_lock

        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(pretrained_model_name_or_path):
            model_path = snapshot_download(
                model_id=pretrained_model_name_or_path,
                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

            return model_path
    return pretrained_model_name_or_path


def get_tokenizer(
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
        except ImportError as e:
            raise ImportError("MistralTokenizer requires vllm package.\n"
                              "Please install it with `pip install vllm` "
                              "to use mistral tokenizer mode.") from e
        return MistralTokenizer.from_pretrained(
            str(pretrained_model_name_or_path))
    else:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
}

OPENAI_COMPATIBLE_BACKENDS = [
    k for k, v in ASYNC_REQUEST_FUNCS.items()
    if v in (async_request_openai_completions,
             async_request_openai_chat_completions)
]
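Callers select a backend through the ASYNC_REQUEST_FUNCS registry above. The sketch below is a hypothetical, minimal driver showing one request through the OpenAI-compatible path; the URL, model path, and token counts are placeholders, not values taken from this repository.

# Hypothetical driver sketch: pick a request function from the registry and await it.
# The URL, model path, and token counts are placeholders.
import asyncio
from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput

async def main():
    request_func = ASYNC_REQUEST_FUNCS["openai"]  # -> async_request_openai_completions
    result = await request_func(
        RequestFuncInput(
            prompt="Hello, world",
            api_url="http://0.0.0.0:8001/v1/completions",
            prompt_len=3,
            output_len=16,
            model="/workspace/llms/qwen2.5/Qwen2.5-VL-7B-Instruct/",
            ignore_eos=True,
        ))
    print(result.success, result.ttft, result.latency, len(result.itl))

asyncio.run(main())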
3_env_check-batch_onlinetests/scripts/benchmark_dataset.py
0 → 100644
This diff is collapsed.
3_env_check-batch_onlinetests/scripts/benchmark_serving.py
0 → 100644
This diff is collapsed.
3_env_check-batch_onlinetests/scripts/benchmark_utils.py
0 → 100644
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    one metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):

    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
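A short, hypothetical usage sketch for the helpers above: convert_to_pytorch_benchmark_format only emits records when SAVE_TO_PYTORCH_BENCHMARK_FORMAT is set, and write_to_json serializes them through InfEncoder so infinite floats are stored as the string "inf". The argparse namespace and metric values below are made up.

# Hypothetical usage sketch for benchmark_utils; the namespace and numbers are made up.
import argparse
import os
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"  # otherwise no records are produced
args = argparse.Namespace(model="Qwen2.5-VL-7B", tensor_parallel_size=1)
records = convert_to_pytorch_benchmark_format(
    args=args,
    metrics={"mean_ttft_ms": [123.4], "total_token_throughput": [float("inf")]},
    extra_info={"tensor_parallel_size": 1},
)
write_to_json("benchmark_records.json", records)  # inf values are written as "inf"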
3_env_check-batch_onlinetests/scripts/entrypoint.sh
0 → 100644
#!/bin/bash

# Run the environment check
echo "==================== Starting system environment check ===================="
#/workspace/scripts/run_envcheck.sh

# Run the performance tests
echo "==================== Starting performance tests ===================="
/workspace/scripts/run_benchmark.sh

echo "==================== All tests completed ===================="
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/run_benchmark.sh
0 → 100644
#!/bin/bash

# Initialize output directories
mkdir -p /workspace/test/inference_outputs/results
mkdir -p /workspace/test/inference_outputs/logs/server
mkdir -p /workspace/test/inference_outputs/logs/models

# Base port
BASE_PORT=8001

# Read the config file (semicolon-separated fields)
while IFS=';' read -r model_name model_path tp data_type batch_list prompt_pairs max_model_len gpu_mem_util; do
    # Clean up fields (strip whitespace and quotes)
    model_name=$(echo "$model_name" | xargs)
    model_path=$(echo "$model_path" | xargs)
    tp=$(echo "$tp" | xargs)
    data_type=$(echo "$data_type" | xargs)
    batch_list=$(echo "$batch_list" | tr -d '"' | xargs)
    prompt_pairs=$(echo "$prompt_pairs" | tr -d '()"' | xargs)
    max_model_len=$(echo "$max_model_len" | xargs)
    gpu_mem_util=$(echo "$gpu_mem_util" | xargs)

    # Allocate a port dynamically
    port=$((BASE_PORT++))

    # Generate server.sh
    cat > "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh" << EOF
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1

vllm serve "$model_path" --trust-remote-code \\
    --enable-prefix-caching \\
    --dtype $data_type \\
    --tensor-parallel-size $tp \\
    --max-model-len $max_model_len \\
    --port $port \\
    --gpu-memory-utilization $gpu_mem_util
EOF

    # Make it executable
    chmod +x "/workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"
    echo "Generated server script for ${model_name}_tp${tp} at /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh"

    # 1. Start the vLLM server and write its log to server.log
    /workspace/test/inference_outputs/server_${model_name}_tp${tp}.sh > "/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log" 2>&1 &
    SERVER_PID=$!

    # 2. Improved log-based status check
    check_server_status() {
        local log_file=$1
        local server_pid=$2
        local success_msg="Starting vLLM API server on http://0.0.0.0"
        local error_patterns=("RuntimeError" "ValueError" "segmentation fault" "core dumped")

        # Check for the success message
        if grep -q "$success_msg" "$log_file"; then
            echo "✅ Server started successfully!"
            return 0
        fi

        # Check for error messages
        for pattern in "${error_patterns[@]}"; do
            if grep -i -q "$pattern" "$log_file"; then
                echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Error detected in logs (pattern: $pattern)!"
                echo "===== ERROR CONTEXT ====="
                grep -i -A 5 -B 5 "$pattern" "$log_file" | tail -n 20
                echo "========================="
                return 1
            fi
        done

        # Check whether the process is still alive
        if ! kill -0 $server_pid 2>/dev/null; then
            echo -e "\n❌ [$(date '+%Y-%m-%d %H:%M:%S')] Server process died unexpectedly!"
            echo "===== LAST LOG LINES ====="
            tail -n 20 "$log_file"
            echo "========================="
            return 1
        fi

        # Default: keep waiting
        return 2
    }

    # 3. Wait for the server to start or fail
    echo -e "\n🔍 [$(date '+%Y-%m-%d %H:%M:%S')] Starting monitoring for ${model_name}_tp${tp} (PID: $SERVER_PID)"
    max_wait_seconds=20000
    start_time=$(date +%s)
    log_file="/workspace/test/inference_outputs/logs/server/${model_name}_tp${tp}_server.log"

    while true; do
        sleep 20  # check every 20 seconds

        check_server_status "$log_file" "$SERVER_PID"
        status=$?

        # Success
        if [ $status -eq 0 ]; then
            break
        fi

        # Failure
        if [ $status -eq 1 ]; then
            # Clean up resources
            kill $SERVER_PID 2>/dev/null
            pkill -f "vllm serve" 2>/dev/null
            echo "🛑 Cleaned up resources after failure"
            # Skip straight to the next model
            continue 2
        fi

        # Check for timeout
        current_time=$(date +%s)
        elapsed=$((current_time - start_time))
        if [ $elapsed -ge $max_wait_seconds ]; then
            echo -e "\n⏰ [$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for server to start!"
            # Clean up resources
            kill $SERVER_PID 2>/dev/null
            pkill -f "vllm serve" 2>/dev/null
            echo "🛑 Cleaned up resources after timeout"
            # Skip straight to the next model
            continue 2
        fi

        echo "Waiting... (${elapsed}s elapsed)"
    done

    # 4. Run the tests only after a successful start
    echo -e "\n🚀 [$(date '+%Y-%m-%d %H:%M:%S')] Running tests for ${model_name}_tp${tp}..."

    # Export the test environment variables
    export MODEL_NAME="$model_name"
    export MODEL_PATH="$model_path"
    export TP="$tp"
    export DATA_TYPE="$data_type"
    export BATCH_LIST="$batch_list"
    export PROMPT_PAIRS="$prompt_pairs"
    export PORT="$port"

    # Run the tests
    ./test.sh

    # 5. Clean up after the tests
    kill $SERVER_PID
    pkill -f "vllm serve" 2>/dev/null
    echo "✅ [$(date '+%Y-%m-%d %H:%M:%S')] ${model_name}_tp${tp} test completed and cleaned up"

done < <(grep -v '^#' ../configs/model_to_test.cfg | grep -v '^$')

echo -e "\n📊 [$(date '+%Y-%m-%d %H:%M:%S')] All tests completed. Results saved to results/"
\ No newline at end of file
3_env_check-batch_onlinetests/scripts/run_envcheck.sh
0 → 100644
This diff is collapsed.
3_env_check-batch_onlinetests/scripts/test.sh
0 → 100644
#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export NCCL_MIN_NCHANNELS=16
export NCCL_MAX_NCHANNELS=16
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_USE_TRITON_PREFIX_FLASH_ATTN=1
export VLLM_NUMA_BIND=1
export VLLM_RANK0_NUMA=0
export VLLM_RANK1_NUMA=1
export VLLM_RANK2_NUMA=2
export VLLM_RANK3_NUMA=3
export VLLM_RANK4_NUMA=4
export VLLM_RANK5_NUMA=5
export VLLM_RANK6_NUMA=6
export VLLM_RANK7_NUMA=7

# Read parameters from environment variables
model_name=${MODEL_NAME}
model_path=${MODEL_PATH}
tp=${TP}
data_type=${DATA_TYPE}
batch_list=${BATCH_LIST}
prompt_pairs=${PROMPT_PAIRS}
port=${PORT}

# Build the result file name
result_file="/workspace/test/inference_outputs/results/${model_name}_tp${tp}.csv"
echo "tp,data_type,batch,prompt_tokens,completion_tokens,TOTAL_THROUGHPUT(toks/s),generate_throughput(toks/s),TTFT(ms),TPOT(ms),ITL(ms)" > "$result_file"

# Convert the strings into arrays
IFS=' ' read -ra batches <<< "$batch_list"
IFS=',' read -ra pairs <<< "$prompt_pairs"

# Run the tests
for batch in "${batches[@]}"; do
    for pair in "${pairs[@]}"; do
        IFS=' ' read -r prompt_tokens completion_tokens <<< "$pair"

        log_file="/workspace/test/inference_outputs/logs/models/${model_name}_${tp}/batch_${batch}_prompt_${prompt_tokens}_completion_${completion_tokens}.log"
        mkdir -p "$(dirname "$log_file")"

        echo "Running: batch=$batch, prompt=$prompt_tokens, completion=$completion_tokens"

        python benchmark_serving.py \
            --backend openai \
            --port "$port" \
            --model "$model_path" \
            --trust-remote-code \
            --dataset-name random \
            --ignore-eos \
            --random-input-len "$prompt_tokens" \
            --random-output-len "$completion_tokens" \
            --num-prompts "$batch" \
            2>&1 | tee "$log_file"

        # Extract metrics
        TOTAL_THROUGHPUT=$(grep "^Total Token" "$log_file" | awk '{print $5}')
        GEN_THROUGHPUT=$(grep "^Output token" "$log_file" | awk '{print $5}')
        TTFT=$(grep "^Mean TTFT" "$log_file" | awk '{print $4}')
        TPOT=$(grep "^Mean TPOT" "$log_file" | awk '{print $4}')
        ITL=$(grep "^Mean ITL" "$log_file" | awk '{print $4}')

        echo "$tp,$data_type,$batch,$prompt_tokens,$completion_tokens,$TOTAL_THROUGHPUT,$GEN_THROUGHPUT,$TTFT,$TPOT,$ITL" >> "$result_file"
    done
done
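The CSV row above is filled by scraping the benchmark_serving.py console output with grep/awk. The sketch below is a hypothetical Python equivalent of that scrape; it assumes the same log-line prefixes and column positions that test.sh greps for, and is offered only as a reference if the parsing ever moves out of the shell.

# Hypothetical Python equivalent of the grep/awk metric scrape in test.sh.
# Assumes benchmark_serving.py log lines start with the same prefixes test.sh
# greps for; the field indices mirror the awk column numbers.
def scrape_metrics(log_path: str) -> dict:
    # prefix -> (metric name, 1-based whitespace-separated field index, as in awk)
    wanted = {
        "Total Token": ("total_throughput", 5),
        "Output token": ("generate_throughput", 5),
        "Mean TTFT": ("ttft_ms", 4),
        "Mean TPOT": ("tpot_ms", 4),
        "Mean ITL": ("itl_ms", 4),
    }
    metrics = {}
    with open(log_path) as f:
        for line in f:
            for prefix, (name, field_idx) in wanted.items():
                if line.startswith(prefix):
                    parts = line.split()
                    if len(parts) >= field_idx:
                        metrics[name] = parts[field_idx - 1]
    return metrics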
3_env_check-batch_onlinetests/start.sh
0 → 100644
docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v /public/opendas/DL_DATA/llm-models:/workspace/llms/:ro \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --network=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
\ No newline at end of file