jerrrrry / easystart_v0.1 · Commits

Commit 8d4db4be ("Initial commit")
Authored Jun 05, 2025 by jerrrrry

Showing 19 changed files with 2083 additions and 0 deletions (+2083, -0)
1_env_check/Dockerfile                                                      +36   -0
1_env_check/scripts/run_envcheck.sh                                         +89   -0
1_env_check/start.sh                                                        +18   -0
2_env_check&model_download&llm_inference/Dockerfile                         +37   -0
2_env_check&model_download&llm_inference/configs/download-list.cfg          +4    -0
2_env_check&model_download&llm_inference/configs/model_to_test.cfg          +7    -0
2_env_check&model_download&llm_inference/scripts/benchmark_throughput.py    +669  -0
2_env_check&model_download&llm_inference/scripts/download_model.sh          +91   -0
2_env_check&model_download&llm_inference/scripts/entrypoint.sh              +16   -0
2_env_check&model_download&llm_inference/scripts/run_benchmark.sh           +86   -0
2_env_check&model_download&llm_inference/scripts/run_envcheck.sh            +89   -0
2_env_check&model_download&llm_inference/start.sh                           +20   -0
3_env_check&batches_llm_inference/Dockerfile                                +36   -0
3_env_check&batches_llm_inference/configs/model_to_test.cfg                 +10   -0
3_env_check&batches_llm_inference/scripts/benchmark_throughput.py           +669  -0
3_env_check&batches_llm_inference/scripts/entrypoint.sh                     +11   -0
3_env_check&batches_llm_inference/scripts/run_benchmark.sh                  +86   -0
3_env_check&batches_llm_inference/scripts/run_envcheck.sh                   +89   -0
3_env_check&batches_llm_inference/start.sh                                  +20   -0
1_env_check/Dockerfile  (new file, 0 → 100644)

# Use the official SourceFind base image
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/test/env_check_outputs

# Copy the script and set permissions
COPY ./scripts/run_envcheck.sh /workspace/scripts/

# Verify the script is present and readable
RUN ls -l /workspace/scripts/ && \
    file /workspace/scripts/run_envcheck.sh && \
    head -n 1 /workspace/scripts/run_envcheck.sh   # check the shebang

# Set the working directory (directly to the scripts directory)
WORKDIR /workspace/scripts/

# Run the script directly (no cd needed)
CMD bash -c "bash run_envcheck.sh"
1_env_check/scripts/run_envcheck.sh  (new file, 0 → 100644)

#!/bin/bash
set -eo pipefail   # strict error handling

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

echo "==================== Starting system environment check ===================="

# Basic check helpers
run_test() {
    local name=$1
    shift
    echo "[RUN] $name"
    "$@" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name check failed" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

run_pipe_test() {
    local name=$1
    local cmd=$2
    echo "[RUN] $name"
    bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name check failed" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Basic system checks
run_test rocm_bandwidth_test rocm-bandwidth-test
run_test hy_smi hy-smi
run_test hy_smi_config hy-smi -c
run_test pip_list pip list
run_test cpu_info lscpu
run_test cpu_cores nproc
run_test memory_usage free -h
run_test disk_usage df -h
run_test hardware_info lshw -short || true
run_test network_interfaces ip a
run_test ibstat ibstat
run_test ibdev2netdev ibdev2netdev
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
cd /workspace/test/env_check_outputs
if command -v git &>/dev/null && command -v make &>/dev/null; then
    if [ ! -d rccl-tests ]; then
        git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
    fi
    cd rccl-tests || exit 1
    source /opt/dtk/env.sh
    if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
        CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
        ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
        ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
    else
        echo "[ERROR] RCCL build failed" | tee "$log_dir/rccl_build_fail.log"
    fi
    cd ..
else
    echo "[WARN] git or make not available, skipping RCCL test" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [ ! -d dcu_env_check ]; then
    git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
        echo "[ERROR] Failed to clone the DCU environment check repo" | tee "$log_dir/dcu_clone_fail.log"
        exit 1
    }
fi
cd dcu_env_check && {
    bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"
    cd ..
} || {
    echo "[ERROR] DCU environment check failed to run" | tee "$log_dir/dcu_check_fail.log"
    exit 1
}

echo "==================== Check complete ===================="
echo "All logs saved to: $log_dir"
ls -lh "$log_dir"
1_env_check/start.sh  (new file, 0 → 100644)

docker build -t env_check . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    env_check
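For orientation, here is a minimal sketch of what to expect on the host once the container exits. The log names come from the run_test and run_pipe_test calls in run_envcheck.sh above; the listing itself is illustrative, not captured output.

# After `bash start.sh` completes, the mounted host directory should hold one log per check:
ls outputs/env_check_outputs/
# expected names (from run_envcheck.sh): rocm_bandwidth_test.log, hy_smi.log, hy_smi_config.log,
# pip_list.log, cpu_info.log, cpu_cores.log, memory_usage.log, disk_usage.log, hardware_info.log,
# network_interfaces.log, ibstat.log, ibdev2netdev.log, ACS_stat.log, rocm_info.log,
# plus all_reduce_perf_8.log, all_reduce_perf_4.log and dcu_env_check.log if those stages ran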
2_env_check&model_download&llm_inference/Dockerfile  (new file, 0 → 100644)

# Use the official SourceFind base image
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/configs && \
    mkdir -p /workspace/test/env_check_outputs && \
    mkdir -p /workspace/test/inference_outputs && \
    mkdir -p /workspace/test/models

# Copy scripts and configs
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*

# Set the working directory (directly to the scripts directory)
WORKDIR /workspace/scripts/

# Run the entrypoint directly (no cd needed)
CMD bash -c "bash entrypoint.sh"
2_env_check&model_download&llm_inference/configs/download-list.cfg  (new file, 0 → 100644)

# Format: <model_id>;<local_save_path>
# The model ID is the ID used on the ModelScope website
Qwen/Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B
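Each entry in this file expands into one ModelScope CLI call in download_model.sh (shown later in this commit). As a sketch, the single entry above corresponds to roughly:

# Equivalent manual download for the entry above (download_model.sh runs this per config line):
modelscope download --model Qwen/Qwen3-0.6B --local_dir /workspace/test/models/Qwen/Qwen3-0.6B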
2_env_check&model_download&llm_inference/configs/model_to_test.cfg  (new file, 0 → 100644)

# Format:
# model_name;model_path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# The model path is the path inside the Docker container
# Multiple values are comma-separated
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
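To illustrate the comma-separated fields, here is a hypothetical entry (not part of the commit) and how run_benchmark.sh, shown further down, would expand it:

#   Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1,4,8;1024;512;float16;32768;0.95
# run_benchmark.sh replaces the commas with spaces, so benchmark_throughput.py receives:
#   --num-prompts 1 4 8 --input-len 1024 --output-len 512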
2_env_check&model_download&llm_inference/scripts/benchmark_throughput.py  (new file, 0 → 100644)

# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
import random
import time
from functools import cache
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import uvloop
from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

from vllm.inputs import PromptType, TextPrompt
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators


@dataclasses.dataclass
class SampleRequest:
    """A class representing a single inference request for benchmarking.

    Attributes:
        prompt: The input text prompt for the model.
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
            images).
        lora_request: Optional LoRARequest specifying the LoRA to use.
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[MultiModalDataDict] = None
    lora_request: Optional[LoRARequest] = None


def _get_prompt_for_image_model(question: str, *, model: str) -> str:
    """Prepend and append special tokens around the question to form a prompt.

    Args:
        question: The input question text to wrap with special tokens
        model: The name of the model being used, to determine which special
            tokens to add

    Returns:
        The formatted prompt string with appropriate special tokens for the
            model

    Raises:
        ValueError: If an unsupported model name is provided
    """
    model = model.lower()
    if "pixtral" in model:
        return f"<s>[INST]{question}\n[IMG][/INST]"
    raise ValueError(f"Unsupported model {model}")


@cache
def lora_path_on_disk(lora_path: str) -> str:
    return get_adapter_absolute_path(lora_path)


lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}


def get_random_lora_request(
        args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    global lora_tokenizer_cache
    lora_id = random.randint(1, args.max_loras)
    lora_request = LoRARequest(lora_name=str(lora_id),
                               lora_int_id=lora_id,
                               lora_path=lora_path_on_disk(args.lora_path))
    if lora_id not in lora_tokenizer_cache:
        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
    return lora_request, lora_tokenizer_cache[lora_id]


def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    dataset_path: str = args.dataset
    num_requests: int = args.num_prompts
    fixed_output_len: Optional[int] = args.output_len
    model: str = args.model
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
    filtered_dataset: List[SampleRequest] = []
    for data in tqdm(dataset,
                     total=len(filtered_dataset),
                     desc="sampling requests"):
        if len(filtered_dataset) == num_requests:
            break

        # Only keep the first two turns of each conversation.
        prompt = data["conversations"][0]["value"]
        completion = data["conversations"][1]["value"]

        multi_modal_data: Optional[MultiModalDataDict] = None
        if "image" in data:
            multi_modal_data = multi_modal_data or {}
            image_path = data["image"]
            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
            assert isinstance(image_path,
                              str), "Only support single image input"
            try:
                multi_modal_data["image"] = Image.open(image_path).convert(
                    "RGB")
            except FileNotFoundError:
                # Ignore datapoint where asset is missing
                continue
            prompt = _get_prompt_for_image_model(question=prompt, model=model)

        request_tokenizer = tokenizer
        lora_request: Optional[LoRARequest] = None
        if args.enable_lora:
            lora_request, lora_tokenizer = get_random_lora_request(args)
            if lora_tokenizer:
                request_tokenizer = lora_tokenizer

        # Tokenize the prompts and completions.
        prompt_token_ids = request_tokenizer(prompt).input_ids
        completion_token_ids = request_tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append(
            SampleRequest(prompt=prompt,
                          prompt_len=prompt_len,
                          expected_output_len=output_len,
                          multi_modal_data=multi_modal_data,
                          lora_request=lora_request))

    return filtered_dataset


def run_vllm(
    requests_json: Dict[str, List[SampleRequest]],
    n: int,
    num_iters_warmup: int,
    engine_args: EngineArgs,
) -> dict:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))

    # Warmup with short dummy prompts.
    warmup_sampling_params = SamplingParams(
        n=args.n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=10,
    )
    dummy_prompt_token_ids = np.random.randint(10000, size=(1, 10))
    dummy_prompts: List[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
    print("Warming up...")
    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
        llm.generate(dummy_prompts,
                     sampling_params=warmup_sampling_params,
                     use_tqdm=False)

    info_json = {}
    for ELEprompt in args.num_prompts:
        for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
            info = {}
            requests = requests_json["{}_{}_{}".format(
                ELEprompt, ELEinput, ELEoutput)]

            # Add the requests to the engine.
            prompts: List[TextPrompt] = []
            sampling_params: List[SamplingParams] = []
            for request in requests:
                prompts.append(
                    TextPrompt(prompt=request.prompt,
                               multi_modal_data=request.multi_modal_data))
                sampling_params.append(
                    SamplingParams(
                        n=n,
                        temperature=1.0,
                        top_p=1.0,
                        ignore_eos=True,
                        max_tokens=request.expected_output_len,
                    ))
            lora_requests: Optional[List[LoRARequest]] = None
            if engine_args.enable_lora:
                lora_requests = [request.lora_request for request in requests]

            use_beam_search = False

            if not use_beam_search:
                start = time.perf_counter()
                real_output = llm.generate(prompts,
                                           sampling_params,
                                           lora_request=lora_requests,
                                           use_tqdm=True)
                end = time.perf_counter()
            else:
                assert lora_requests is None, "BeamSearch API does not support LoRA"
                prompts = [request.prompt for request in requests]
                # output_len should be the same for all requests.
                output_len = requests[0][2]
                for request in requests:
                    assert request.expected_output_len == output_len
                start = time.perf_counter()
                real_output = llm.beam_search(
                    prompts,
                    BeamSearchParams(
                        beam_width=n,
                        max_tokens=output_len,
                        ignore_eos=True,
                    ))
                end = time.perf_counter()

            # Per-request latency/throughput metrics from the engine's request metrics.
            total_ttfts = []
            total_tpops = []
            total_output_token_throughput = []
            total_inout_token_throughput = []
            for output in real_output:
                ttft_ = output.metrics.first_token_time - output.metrics.arrival_time
                tpop_ = (output.metrics.finished_time -
                         output.metrics.arrival_time - ttft_) / (ELEoutput - 1)
                output_token_throughput = (ELEoutput) / (
                    output.metrics.finished_time - output.metrics.arrival_time)
                inout_token_throughput = (ELEoutput + ELEinput) / (
                    output.metrics.finished_time - output.metrics.arrival_time)
                total_ttfts.append(ttft_)
                total_tpops.append(tpop_)
                total_output_token_throughput.append(output_token_throughput)
                total_inout_token_throughput.append(inout_token_throughput)

            total_num_tokens = sum(request.prompt_len +
                                   request.expected_output_len
                                   for request in requests)
            total_output_tokens = sum(request.expected_output_len
                                      for request in requests)

            info["elapsed_time"] = np.around(end - start, 2)
            info["Throughput"] = np.around(len(requests) / info['elapsed_time'], 2)
            info["total_tokens"] = np.around(total_num_tokens / info['elapsed_time'], 2)
            info["output_tokens"] = np.around(total_output_tokens / info['elapsed_time'], 2)
            info["ttft_mean"] = np.around(np.mean(total_ttfts), 5)
            info["ttft_median"] = np.around(np.median(total_ttfts or 0), 5)
            info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99), 5)
            info["tpop_mean"] = np.around(np.mean(total_tpops), 4)
            info["tpop_median"] = np.around(np.median(total_tpops or 0), 5)
            info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99), 5)
            info["output_token_throughput_mean"] = np.around(
                np.mean(total_output_token_throughput), 2)
            info["output_token_throughput_median"] = np.around(
                np.median(total_output_token_throughput or 0), 2)
            info["output_token_throughput_p99"] = np.around(
                np.percentile(total_output_token_throughput or 0, 99), 2)
            info["inout_token_throughput_mean"] = np.around(
                np.mean(total_inout_token_throughput), 2)
            info["inout_token_throughput_median"] = np.around(
                np.median(total_inout_token_throughput or 0), 2)
            info["inout_token_throughput_p99"] = np.around(
                np.percentile(total_inout_token_throughput or 0, 99), 2)
            info_json["{}_{}_{}".format(ELEprompt, ELEinput, ELEoutput)] = info

            print("prompt:{},input:{},output:{}".format(ELEprompt, ELEinput, ELEoutput))
            print(f"Latency: {info['elapsed_time']:.2f} s")
            print(f"Throughput: {len(requests) / info['elapsed_time']:.2f} requests/s, "
                  f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
                  f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
            print("==============================================")
            print(f"total_out_tokens: {total_output_tokens:.2f} tokens")
            print(f"elapsed_time: {info['elapsed_time']:.2f} s")                  # total elapsed time
            print(f"TTFT_mean: {info['ttft_mean'] * 1000:.2f} ms")                # time to first token
            print(f"ttft_p99: {info['ttft_p99'] * 1000:.2f} ms")
            print(f"ttft_median: {info['ttft_median'] * 1000:.2f} ms")
            print(f"TPOP_mean: {info['tpop_mean'] * 1000:.2f} ms")                # per-token decode time
            print(f"tpop_median: {info['tpop_median'] * 1000:.2f} ms")
            print(f"tpop_p99: {info['tpop_p99'] * 1000:.2f} ms")
            print(f"output_token_throughput_mean: {info['output_token_throughput_mean']:.2f} tokens/s")    # per-request generation throughput
            print(f"output_token_throughput_median: {info['output_token_throughput_median']:.2f} tokens/s")
            print(f"output_token_throughput_p99: {info['output_token_throughput_p99']:.2f} tokens/s")
            print(f"inout_token_throughput_mean: {info['inout_token_throughput_mean']:.2f} tokens/s")      # per-request total throughput
            print(f"inout_token_throughput_median: {info['inout_token_throughput_median']:.2f} tokens/s")
            print(f"inout_token_throughput_p99: {info['inout_token_throughput_p99']:.2f} tokens/s")
            print("==============================================")
            print("\n")

    return info_json


async def run_vllm_async(
    requests: List[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
) -> float:
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:

        # Add the requests to the engine.
        prompts: List[TextPrompt] = []
        sampling_params: List[SamplingParams] = []
        lora_requests: List[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TextPrompt(prompt=request.prompt,
                           multi_modal_data=request.multi_modal_data))
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                ))
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp,
                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
            generator = llm.generate(prompt,
                                     sp,
                                     lora_request=lr,
                                     request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start


def run_hf(
    requests: List[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt, prompt_len, output_len = requests[i]
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


def run_mii(
    requests: List[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    from mii import client, serve
    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [request.prompt for request in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start


def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        vocab_size = tokenizer.vocab_size
        requests_json = {}
        for ELEprompt in args.num_prompts:
            for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
                requests = []
                for _ in range(ELEprompt):
                    request_tokenizer = tokenizer
                    lora_request: Optional[LoRARequest] = None
                    if args.enable_lora:
                        lora_request, lora_tokenizer = get_random_lora_request(args)
                        if lora_tokenizer:
                            request_tokenizer = lora_tokenizer

                    # Synthesize a prompt with the given input length.
                    candidate_ids = [
                        random.randint(0, vocab_size - 1)
                        for _ in range(ELEinput)
                    ]
                    # As the tokenizer may add additional tokens like BOS, we need to
                    # try different lengths to get the desired input length.
                    for _ in range(5):  # Max attempts to correct
                        candidate_prompt = request_tokenizer.decode(candidate_ids)
                        tokenized_len = len(request_tokenizer.encode(candidate_prompt))
                        if tokenized_len == ELEinput:
                            break
                        # Adjust length based on the difference
                        diff = ELEinput - tokenized_len
                        if diff > 0:
                            candidate_ids.extend([
                                random.randint(100, vocab_size - 100)
                                for _ in range(diff)
                            ])
                        else:
                            candidate_ids = candidate_ids[:diff]
                    requests.append(
                        SampleRequest(prompt=candidate_prompt,
                                      prompt_len=ELEinput,
                                      expected_output_len=ELEoutput,
                                      lora_request=lora_request))
                requests_json["{}_{}_{}".format(ELEprompt, ELEinput, ELEoutput)] = requests
    else:
        requests = sample_requests(tokenizer, args)

    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                ))
        else:
            info_json = run_vllm(requests_json, args.n, args.num_iters_warmup,
                                 EngineArgs.from_cli_args(args))
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.hf_max_batch_size, args.trust_remote_code)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model,
                               args.tensor_parallel_size, args.output_len)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if is_multi_modal:
        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")

    # Write the results as a CSV-style table: one header row, then one row per
    # <num_prompts>_<input_len>_<output_len> combination.
    with open(args.output_json, "w") as f:
        title = "bs_in_out"
        data_keys = info_json[list(info_json.keys())[0]].keys()
        keys_string = ','.join(data_keys)
        title = title + "," + keys_string
        f.write(title)
        f.write("\n")
        for key, value in info_json.items():
            values_as_strings = [str(value) for value in info_json[key].values()]
            values_string = ','.join(values_as_strings)
            key = key + "," + values_string
            f.writelines(key)
            f.write("\n")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
                        default="vllm")
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
                        help="Path to the dataset. The dataset is expected to "
                        "be a json in form of List[Dict[..., conversations: "
                        "List[Dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--input-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=1,
                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        nargs="*",
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument('--output-json',
                        type=str,
                        default=None,
                        help='Path to save the throughput results in JSON format.')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    # LoRA
    parser.add_argument("--lora-path",
                        type=str,
                        default=None,
                        help="Path to the lora adapters to use. This can be an "
                        "absolute path, a relative path, or a Hugging Face model "
                        "identifier.")

    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    if args.dataset is None:
        assert args.input_len is not None
        assert args.output_len is not None
    else:
        assert args.input_len is None

    if args.enable_lora:
        assert args.lora_path is not None

    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    main(args)
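For reference, a sketch of how this script gets invoked for the single entry in model_to_test.cfg. The flags are the ones assembled by run_benchmark.sh below; the output path is chosen here to match that script's naming scheme and is illustrative.

python /workspace/scripts/benchmark_throughput.py \
    --model /workspace/test/models/Qwen/Qwen3-0.6B \
    --tensor-parallel-size 1 \
    --num-prompts 1 \
    --input-len 512 \
    --output-len 512 \
    --dtype float16 \
    --trust-remote-code \
    --max-model-len 32768 \
    --gpu-memory-utilization 0.95 \
    --output-json /workspace/test/inference_outputs/Qwen3-0.6B/Qwen3-0.6B_tp1.txt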
2_env_check&model_download&llm_inference/scripts/download_model.sh  (new file, 0 → 100644)

#!/bin/bash
# ModelScope CLI batch download script
# Usage: ./ms_download.sh -f <model_list.cfg> [-F to force re-download]
pip install modelscope -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

# Parse arguments
CONFIG_FILE=""
FORCE_DOWNLOAD=false
MODELSCOPE_CMD="modelscope download"

while getopts "f:F" opt; do
    case $opt in
        f) CONFIG_FILE="$OPTARG" ;;
        F) FORCE_DOWNLOAD=true ;;
        *) echo "Usage: $0 -f config.cfg [-F]" >&2
           exit 1
    esac
done

# Check the config file
if [ ! -f "$CONFIG_FILE" ]; then
    echo "Error: Config file $CONFIG_FILE not found!" >&2
    exit 1
fi

# Check that the modelscope CLI is installed
if ! command -v modelscope &> /dev/null; then
    echo "Error: modelscope CLI not installed. Please install with: pip install modelscope" >&2
    exit 1
fi

# Read the config file
TOTAL=0
SUCCESS=0
FAILED=0

echo "=== Starting batch download ==="
while IFS=';' read -r model_id local_dir || [[ -n "$model_id" ]]; do
    # Skip blank lines and comments
    [[ -z "$model_id" || "$model_id" =~ ^# ]] && continue
    ((TOTAL++))

    # Trim whitespace
    model_id=$(echo "$model_id" | xargs)
    local_dir=$(echo "$local_dir" | xargs)

    echo -e "\n[Progress] $TOTAL. Downloading $model_id"
    echo "[Location] $local_dir"

    # Skip if the target directory already exists (unless forced)
    if [ "$FORCE_DOWNLOAD" = false ] && [ -d "$local_dir" ]; then
        echo "[Status] Skipped (already exists)"
        ((SUCCESS++))
        continue
    fi

    # Create the target directory
    mkdir -p "$local_dir" || {
        echo "[Error] Failed to create directory $local_dir" >&2
        ((FAILED++))
        continue
    }

    # Run the download
    if $MODELSCOPE_CMD --model "$model_id" --local_dir "$local_dir"; then
        echo "[Status] Download successful"
        ((SUCCESS++))
    else
        echo "[Error] Download failed" >&2
        ((FAILED++))
        # Remove the empty directory so nothing is left behind
        rmdir "$local_dir" 2>/dev/null
    fi
done < "$CONFIG_FILE"

# Summary
echo -e "\n=== Download summary ==="
echo "Total:   $TOTAL"
echo "Success: $SUCCESS"
echo "Failed:  $FAILED"

# Exit status
if [ "$FAILED" -gt 0 ]; then
    exit 1
else
    exit 0
fi
2_env_check&model_download&llm_inference/scripts/entrypoint.sh  (new file, 0 → 100644)

#!/bin/bash
# Run the environment check
echo "==================== Starting system environment check ===================="
/workspace/scripts/run_envcheck.sh

# Download the models
echo "==================== Starting model download ===================="
/workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg

# Run the performance benchmark
echo "==================== Starting performance benchmark ===================="
/workspace/scripts/run_benchmark.sh

echo "==================== All tests complete ===================="
2_env_check&model_download&llm_inference/scripts/run_benchmark.sh  (new file, 0 → 100644)

#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_RPC_TIMEOUT=100000

# Path to the model config file
MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
# Results directory
RESULTS_DIR="/workspace/test/inference_outputs"

# Read the config file, skipping comments and blank lines
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip comment lines and blank lines
    if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
        continue
    fi

    # Parse the config line
    IFS=';' read -ra CONFIG <<< "$line"
    model_name="${CONFIG[0]}"
    model_path="${CONFIG[1]}"
    tp="${CONFIG[2]}"
    batch="${CONFIG[3]//,/ }"               # replace commas with spaces
    prompt_tokens="${CONFIG[4]//,/ }"
    completion_tokens="${CONFIG[5]//,/ }"
    dtype="${CONFIG[6]}"
    max_model_len="${CONFIG[7]}"
    gpu_memory_utilization="${CONFIG[8]}"

    echo "Starting benchmark for model: $model_name"
    echo "Model path: $model_path"
    echo "Parameters:"
    echo "  tensor_parallel_size: $tp"
    echo "  batch_sizes: $batch"
    echo "  prompt_tokens: $prompt_tokens"
    echo "  completion_tokens: $completion_tokens"
    echo "  dtype: $dtype"
    echo "  max_model_len: $max_model_len"
    echo "  gpu_memory_utilization: $gpu_memory_utilization"

    # Create a per-model results directory
    model_result_dir="${RESULTS_DIR}/${model_name}"
    mkdir -p "$model_result_dir"

    # Run the benchmark
    python /workspace/scripts/benchmark_throughput.py \
        --model "$model_path" \
        --tensor-parallel-size "$tp" \
        --num-prompts $batch \
        --input-len $prompt_tokens \
        --output-len $completion_tokens \
        --dtype "$dtype" \
        --trust-remote-code \
        --max-model-len "$max_model_len" \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
        2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"

    echo "Finished benchmark for model: $model_name"
    echo "Results saved in: $model_result_dir"
    echo "----------------------------------------"
done < "$MODELS_CONFIG"
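Despite the --output-json flag name, the writer in benchmark_throughput.py emits a CSV-style table. A sketch of the expected layout of <model>_tp<tp>.txt, with values elided since they depend on the run:

# Header: "bs_in_out" followed by the metric keys of the per-run info dict
bs_in_out,elapsed_time,Throughput,total_tokens,output_tokens,ttft_mean,ttft_median,ttft_p99,tpop_mean,tpop_median,tpop_p99,output_token_throughput_mean,output_token_throughput_median,output_token_throughput_p99,inout_token_throughput_mean,inout_token_throughput_median,inout_token_throughput_p99
# One row per <num_prompts>_<input_len>_<output_len> combination, e.g.:
# 1_512_512,<values...>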
2_env_check&model_download&llm_inference/scripts/run_envcheck.sh  (new file, 0 → 100644)

(Content identical to 1_env_check/scripts/run_envcheck.sh above.)
2_env_check&model_download&llm_inference/start.sh  (new file, 0 → 100644)

docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v $PWD/outputs/models:/workspace/test/models/ \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
3_env_check&batches_llm_inference/Dockerfile  (new file, 0 → 100644)

# Use the official SourceFind base image
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/configs && \
    mkdir -p /workspace/test/env_check_outputs && \
    mkdir -p /workspace/test/inference_outputs

# Copy scripts and configs
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*

# Set the working directory (directly to the scripts directory)
WORKDIR /workspace/scripts/

# Run the entrypoint directly (no cd needed)
CMD bash -c "bash entrypoint.sh"
3_env_check&batches_llm_inference/configs/model_to_test.cfg  (new file, 0 → 100644)

# Format:
# model_name;model_path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# The model path is the path inside the Docker container
# Multiple values are comma-separated
Qwen3-4B;/workspace/test/models/Qwen/Qwen3-4B;1;1;512;512;float16;32768;0.95
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
Qwen3-1.7B;/workspace/test/models/Qwen/Qwen3-1.7B;1;1;512;512;float16;32768;0.95
3_env_check&batches_llm_inference/scripts/benchmark_throughput.py
0 → 100644
View file @
8d4db4be
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import
argparse
import
dataclasses
import
json
import
random
import
time
from
functools
import
cache
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
import
torch
import
uvloop
from
PIL
import
Image
from
tqdm
import
tqdm
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
PreTrainedTokenizerBase
)
from
vllm.inputs
import
PromptType
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
from
vllm.inputs
import
TextPrompt
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.utils
import
get_adapter_absolute_path
from
vllm.multimodal
import
MultiModalDataDict
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
,
get_lora_tokenizer
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
@
dataclasses
.
dataclass
class
SampleRequest
:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
prompt
:
str
prompt_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
"""Prepend and append special tokens around the question to form a prompt.
Args:
question: The input question text to wrap with special tokens
model: The name of the model being used, to determine which special
tokens to add
Returns:
The formatted prompt string with appropriate special tokens for the
model
Raises:
ValueError: If an unsupported model name is provided
"""
model
=
model
.
lower
()
if
"pixtral"
in
model
:
return
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
@
cache
def
lora_path_on_disk
(
lora_path
:
str
)
->
str
:
return
get_adapter_absolute_path
(
lora_path
)
lora_tokenizer_cache
:
Dict
[
int
,
AnyTokenizer
]
=
{}
def
get_random_lora_request
(
args
:
argparse
.
Namespace
)
->
Tuple
[
LoRARequest
,
Optional
[
AnyTokenizer
]]:
global
lora_tokenizer_cache
lora_id
=
random
.
randint
(
1
,
args
.
max_loras
)
lora_request
=
LoRARequest
(
lora_name
=
str
(
lora_id
),
lora_int_id
=
lora_id
,
lora_path
=
lora_path_on_disk
(
args
.
lora_path
))
if
lora_id
not
in
lora_tokenizer_cache
:
lora_tokenizer_cache
[
lora_id
]
=
get_lora_tokenizer
(
lora_request
)
return
lora_request
,
lora_tokenizer_cache
[
lora_id
]
def
sample_requests
(
tokenizer
:
PreTrainedTokenizerBase
,
args
:
argparse
.
Namespace
)
->
List
[
SampleRequest
]:
dataset_path
:
str
=
args
.
dataset
num_requests
:
int
=
args
.
num_prompts
fixed_output_len
:
Optional
[
int
]
=
args
.
output_len
model
:
str
=
args
.
model
if
fixed_output_len
is
not
None
and
fixed_output_len
<
4
:
raise
ValueError
(
"output_len too small"
)
# Load the dataset.
with
open
(
dataset_path
)
as
f
:
dataset
=
json
.
load
(
f
)
# Filter out the conversations with less than 2 turns.
dataset
=
[
data
for
data
in
dataset
if
len
(
data
[
"conversations"
])
>=
2
]
# Shuffle the dataset.
random
.
shuffle
(
dataset
)
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
SampleRequest
]
=
[]
for
data
in
tqdm
(
dataset
,
total
=
len
(
filtered_dataset
),
desc
=
"sampling requests"
):
if
len
(
filtered_dataset
)
==
num_requests
:
break
# Only keep the first two turns of each conversation.
prompt
=
data
[
"conversations"
][
0
][
"value"
]
completion
=
data
[
"conversations"
][
1
][
"value"
]
multi_modal_data
:
Optional
[
MultiModalDataDict
]
=
None
if
"image"
in
data
:
multi_modal_data
=
multi_modal_data
or
{}
image_path
=
data
[
"image"
]
# TODO(vllm-project/vllm/issues/9778): Support multiple images.
assert
isinstance
(
image_path
,
str
),
"Only support single image input"
try
:
multi_modal_data
[
"image"
]
=
Image
.
open
(
image_path
).
convert
(
"RGB"
)
except
FileNotFoundError
:
# Ignore datapoint where asset is missing
continue
prompt
=
_get_prompt_for_image_model
(
question
=
prompt
,
model
=
model
)
request_tokenizer
=
tokenizer
lora_request
:
Optional
[
LoRARequest
]
=
None
if
args
.
enable_lora
:
lora_request
,
lora_tokenizer
=
get_random_lora_request
(
args
)
if
lora_tokenizer
:
request_tokenizer
=
lora_tokenizer
# Tokenize the prompts and completions.
prompt_token_ids
=
request_tokenizer
(
prompt
).
input_ids
completion_token_ids
=
request_tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
if
prompt_len
<
4
or
output_len
<
4
:
# Prune too short sequences.
continue
if
prompt_len
>
1024
or
prompt_len
+
output_len
>
2048
:
# Prune too long sequences.
continue
filtered_dataset
.
append
(
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
multi_modal_data
,
lora_request
=
lora_request
))
return
filtered_dataset
def
run_vllm
(
requests_json
:
List
[
SampleRequest
],
n
:
int
,
num_iters_warmup
:
int
,
engine_args
:
EngineArgs
,
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
# warmup
warmup_sampling_params
=
SamplingParams
(
n
=
args
.
n
,
temperature
=
1.0
,
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
10
,
)
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
size
=
(
1
,
10
))
dummy_prompts
:
List
[
PromptType
]
=
[{
"prompt_token_ids"
:
batch
}
for
batch
in
dummy_prompt_token_ids
.
tolist
()]
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
num_iters_warmup
),
desc
=
"Warmup iterations"
):
llm
.
generate
(
dummy_prompts
,
sampling_params
=
warmup_sampling_params
,
use_tqdm
=
False
)
info_json
=
{}
for
ELEprompt
in
args
.
num_prompts
:
for
ELEinput
,
ELEoutput
in
zip
(
args
.
input_len
,
args
.
output_len
):
info
=
{}
requests
=
requests_json
[
"{}_{}_{}"
.
format
(
ELEprompt
,
ELEinput
,
ELEoutput
)]
# Add the requests to the engine.
prompts
:
List
[
TextPrompt
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
for
request
in
requests
:
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
sampling_params
.
append
(
SamplingParams
(
n
=
n
,
temperature
=
1.0
,
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
:
Optional
[
List
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
lora_requests
=
[
request
.
lora_request
for
request
in
requests
]
use_beam_search
=
False
if
not
use_beam_search
:
start
=
time
.
perf_counter
()
real_output
=
llm
.
generate
(
prompts
,
sampling_params
,
lora_request
=
lora_requests
,
use_tqdm
=
True
)
end
=
time
.
perf_counter
()
else
:
assert
lora_requests
is
None
,
"BeamSearch API does not support LoRA"
prompts
=
[
request
.
prompt
for
request
in
requests
]
# output_len should be the same for all requests.
output_len
=
requests
[
0
][
2
]
for
request
in
requests
:
assert
request
.
expected_output_len
==
output_len
start
=
time
.
perf_counter
()
real_output
=
llm
.
beam_search
(
prompts
,
BeamSearchParams
(
beam_width
=
n
,
max_tokens
=
output_len
,
ignore_eos
=
True
,
))
end
=
time
.
perf_counter
()
total_ttfts
=
[]
total_tpops
=
[]
total_output_token_throughput
=
[]
total_inout_token_throughput
=
[]
for
output
in
real_output
:
ttft_
=
output
.
metrics
.
first_token_time
-
output
.
metrics
.
arrival_time
tpop_
=
(
output
.
metrics
.
finished_time
-
output
.
metrics
.
arrival_time
-
ttft_
)
/
(
ELEoutput
-
1
)
output_token_throughput
=
(
ELEoutput
)
/
(
output
.
metrics
.
finished_time
-
output
.
metrics
.
arrival_time
)
inout_token_throughput
=
(
ELEoutput
+
ELEinput
)
/
(
output
.
metrics
.
finished_time
-
output
.
metrics
.
arrival_time
)
total_ttfts
.
append
(
ttft_
)
total_tpops
.
append
(
tpop_
)
total_output_token_throughput
.
append
(
output_token_throughput
)
total_inout_token_throughput
.
append
(
inout_token_throughput
)
total_num_tokens
=
sum
(
request
.
prompt_len
+
request
.
expected_output_len
for
request
in
requests
)
total_output_tokens
=
sum
(
request
.
expected_output_len
for
request
in
requests
)
# ttft_mean = np.mean(total_ttfts)
# ttft_median = np.median(total_ttfts or 0)
# ttft_p99 = np.percentile(total_ttfts or 0, 99)
# tpop_mean = np.mean(total_tpops)
# tpop_median = np.median(total_tpops or 0)
# tpop_p99 = np.percentile(total_tpops or 0, 99)
# output_token_throughput_mean = np.mean(total_output_token_throughput)
# output_token_throughput_median = np.median(total_output_token_throughput or 0)
# output_token_throughput_p99 = np.percentile(total_output_token_throughput or 0, 99)
# inout_token_throughput_mean = np.mean(total_inout_token_throughput)
# inout_token_throughput_median = np.median(total_inout_token_throughput or 0)
# inout_token_throughput_p99 = np.percentile(total_inout_token_throughput or 0, 99)
info
[
"elapsed_time"
]
=
np
.
around
(
end
-
start
,
2
)
info
[
"Throughput"
]
=
np
.
around
(
len
(
requests
)
/
info
[
'elapsed_time'
],
2
)
info
[
"total_tokens"
]
=
np
.
around
(
total_num_tokens
/
info
[
'elapsed_time'
],
2
)
info
[
"output_tokens"
]
=
np
.
around
(
total_output_tokens
/
info
[
'elapsed_time'
],
2
)
info
[
"ttft_mean"
]
=
np
.
around
(
np
.
mean
(
total_ttfts
),
5
)
info
[
"ttft_median"
]
=
np
.
around
(
np
.
median
(
total_ttfts
or
0
),
5
)
info
[
"ttft_p99"
]
=
np
.
around
(
np
.
percentile
(
total_ttfts
or
0
,
99
),
5
)
info
[
"tpop_mean"
]
=
np
.
around
(
np
.
mean
(
total_tpops
),
4
)
info
[
"tpop_median"
]
=
np
.
around
(
np
.
median
(
total_tpops
or
0
),
5
)
info
[
"tpop_p99"
]
=
np
.
around
(
np
.
percentile
(
total_tpops
or
0
,
99
),
5
)
info
[
"output_token_throughput_mean"
]
=
np
.
around
(
np
.
mean
(
total_output_token_throughput
),
2
)
info
[
"output_token_throughput_median"
]
=
np
.
around
(
np
.
median
(
total_output_token_throughput
or
0
),
2
)
info
[
"output_token_throughput_p99"
]
=
np
.
around
(
np
.
percentile
(
total_output_token_throughput
or
0
,
99
),
2
)
info
[
"inout_token_throughput_mean"
]
=
np
.
around
(
np
.
mean
(
total_inout_token_throughput
),
2
)
info
[
"inout_token_throughput_median"
]
=
np
.
around
(
np
.
median
(
total_inout_token_throughput
or
0
),
2
)
info
[
"inout_token_throughput_p99"
]
=
np
.
around
(
np
.
percentile
(
total_inout_token_throughput
or
0
,
99
),
2
)
info_json
[
"{}_{}_{}"
.
format
(
ELEprompt
,
ELEinput
,
ELEoutput
)]
=
info
print
(
"promt:{},input:{},output:{}"
.
format
(
ELEprompt
,
ELEinput
,
ELEoutput
))
print
(
f
"Latency:
{
info
[
'elapsed_time'
]:.
2
f
}
s"
)
print
(
f
"Throughput:
{
len
(
requests
)
/
info
[
'elapsed_time'
]:.
2
f
}
requests/s, "
f
"
{
total_num_tokens
/
info
[
'elapsed_time'
]:.
2
f
}
total tokens/s, "
f
"
{
total_output_tokens
/
info
[
'elapsed_time'
]:.
2
f
}
output tokens/s"
)
print
(
"=============================================="
)
print
(
f
"total_out_tokens:
{
total_output_tokens
:
.
2
f
}
tokens"
)
print
(
f
"elapsed_time:
{
info
[
'elapsed_time'
]:
.
2
f
}
s"
)
# 总耗时
print
(
f
"TTFT_mean:
{
info
[
'ttft_mean'
]
*
1000
:
.
2
f
}
ms"
)
# 首字延时
print
(
f
"ttft_p99:
{
info
[
'ttft_p99'
]
*
1000
:
.
2
f
}
ms"
)
print
(
f
"ttft_median:
{
info
[
'ttft_median'
]
*
1000
:
.
2
f
}
ms"
)
print
(
f
"TPOP_mean:
{
info
[
'tpop_mean'
]
*
1000
:
.
2
f
}
ms"
)
# 单字decode时间
print
(
f
"tpop_median:
{
info
[
'tpop_median'
]
*
1000
:
.
2
f
}
ms"
)
print
(
f
"tpop_p99:
{
info
[
'tpop_p99'
]
*
1000
:
.
2
f
}
ms"
)
print
(
f
"output_token_throughput_mean:
{
info
[
'output_token_throughput_mean'
]:.
2
f
}
tokens/s"
)
# 单路生成吞吐
print
(
f
"output_token_throughput_median:
{
info
[
'output_token_throughput_median'
]:.
2
f
}
tokens/s"
)
print
(
f
"output_token_throughput_p99:
{
info
[
'output_token_throughput_p99'
]:.
2
f
}
tokens/s"
)
print
(
f
"inout_token_throughput_mean:
{
info
[
'inout_token_throughput_mean'
]:.
2
f
}
tokens/s"
)
# 单路总吞吐
print
(
f
"tinout_token_throughput_median:
{
info
[
'inout_token_throughput_median'
]:.
2
f
}
tokens/s"
)
print
(
f
"inout_token_throughput_p99:
{
info
[
'inout_token_throughput_p99'
]:.
2
f
}
tokens/s"
)
print
(
"=============================================="
)
print
(
"
\n
"
)
return
info_json
async
def
run_vllm_async
(
requests
:
List
[
SampleRequest
],
n
:
int
,
engine_args
:
AsyncEngineArgs
,
disable_frontend_multiprocessing
:
bool
=
False
,
)
->
float
:
from
vllm
import
SamplingParams
async
with
build_async_engine_client_from_engine_args
(
engine_args
,
disable_frontend_multiprocessing
)
as
llm
:
# Add the requests to the engine.
prompts
:
List
[
TextPrompt
]
=
[]
sampling_params
:
List
[
SamplingParams
]
=
[]
lora_requests
:
List
[
Optional
[
LoRARequest
]]
=
[]
for
request
in
requests
:
prompts
.
append
(
TextPrompt
(
prompt
=
request
.
prompt
,
multi_modal_data
=
request
.
multi_modal_data
))
sampling_params
.
append
(
SamplingParams
(
n
=
n
,
temperature
=
1.0
,
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
))
lora_requests
.
append
(
request
.
lora_request
)
generators
=
[]
start
=
time
.
perf_counter
()
for
i
,
(
prompt
,
sp
,
lr
)
in
enumerate
(
zip
(
prompts
,
sampling_params
,
lora_requests
)):
generator
=
llm
.
generate
(
prompt
,
sp
,
lora_request
=
lr
,
request_id
=
f
"test
{
i
}
"
)
generators
.
append
(
generator
)
all_gens
=
merge_async_iterators
(
*
generators
)
async
for
i
,
res
in
all_gens
:
pass
end
=
time
.
perf_counter
()
return
end
-
start
def
run_hf
(
requests
:
List
[
SampleRequest
],
model
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
n
:
int
,
max_batch_size
:
int
,
trust_remote_code
:
bool
,
)
->
float
:
llm
=
AutoModelForCausalLM
.
from_pretrained
(
model
,
torch_dtype
=
torch
.
float16
,
trust_remote_code
=
trust_remote_code
)
if
llm
.
config
.
model_type
==
"llama"
:
# To enable padding in the HF backend.
tokenizer
.
pad_token
=
tokenizer
.
eos_token
llm
=
llm
.
cuda
()
pbar
=
tqdm
(
total
=
len
(
requests
))
start
=
time
.
perf_counter
()
batch
:
List
[
str
]
=
[]
max_prompt_len
=
0
max_output_len
=
0
for
i
in
range
(
len
(
requests
)):
prompt
,
prompt_len
,
output_len
=
requests
[
i
]
# Add the prompt to the batch.
batch
.
append
(
prompt
)
max_prompt_len
=
max
(
max_prompt_len
,
prompt_len
)
max_output_len
=
max
(
max_output_len
,
output_len
)
if
len
(
batch
)
<
max_batch_size
and
i
!=
len
(
requests
)
-
1
:
# Check if we can add more requests to the batch.
_
,
next_prompt_len
,
next_output_len
=
requests
[
i
+
1
]
if
(
max
(
max_prompt_len
,
next_prompt_len
)
+
max
(
max_output_len
,
next_output_len
))
<=
2048
:
# We can add more requests to the batch.
continue
# Generate the sequences.
input_ids
=
tokenizer
(
batch
,
return_tensors
=
"pt"
,
padding
=
True
).
input_ids
llm_outputs
=
llm
.
generate
(
input_ids
=
input_ids
.
cuda
(),
do_sample
=
True
,
num_return_sequences
=
n
,
temperature
=
1.0
,
top_p
=
1.0
,
use_cache
=
True
,
max_new_tokens
=
max_output_len
,
)
# Include the decoding time.
tokenizer
.
batch_decode
(
llm_outputs
,
skip_special_tokens
=
True
)
pbar
.
update
(
len
(
batch
))
# Clear the batch.
batch
=
[]
max_prompt_len
=
0
max_output_len
=
0
end
=
time
.
perf_counter
()
return
end
-
start
def
run_mii
(
requests
:
List
[
SampleRequest
],
model
:
str
,
tensor_parallel_size
:
int
,
output_len
:
int
,
)
->
float
:
from
mii
import
client
,
serve
llm
=
serve
(
model
,
tensor_parallel
=
tensor_parallel_size
)
prompts
=
[
request
.
prompt
for
request
in
requests
]
start
=
time
.
perf_counter
()
llm
.
generate
(
prompts
,
max_new_tokens
=
output_len
)
end
=
time
.
perf_counter
()
client
=
client
(
model
)
client
.
terminate_server
()
return
end
-
start
def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        vocab_size = tokenizer.vocab_size
        requests_json = {}
        for ELEprompt in args.num_prompts:
            for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
                requests = []
                for _ in range(ELEprompt):
                    request_tokenizer = tokenizer
                    lora_request: Optional[LoRARequest] = None
                    if args.enable_lora:
                        lora_request, lora_tokenizer = get_random_lora_request(args)
                        if lora_tokenizer:
                            request_tokenizer = lora_tokenizer

                    # Synthesize a prompt with the given input length.
                    candidate_ids = [
                        random.randint(0, vocab_size - 1)
                        for _ in range(ELEinput)
                    ]
                    # As tokenizer may add additional tokens like BOS, we need to try
                    # different lengths to get the desired input length.
                    for _ in range(5):  # Max attempts to correct
                        candidate_prompt = request_tokenizer.decode(candidate_ids)
                        tokenized_len = len(request_tokenizer.encode(candidate_prompt))

                        if tokenized_len == ELEinput:
                            break

                        # Adjust length based on difference
                        diff = ELEinput - tokenized_len
                        if diff > 0:
                            candidate_ids.extend([
                                random.randint(100, vocab_size - 100)
                                for _ in range(diff)
                            ])
                        else:
                            candidate_ids = candidate_ids[:diff]
                    requests.append(
                        SampleRequest(prompt=candidate_prompt,
                                      prompt_len=ELEinput,
                                      expected_output_len=ELEoutput,
                                      lora_request=lora_request))
                requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                                ELEoutput)] = requests
    else:
        requests = sample_requests(tokenizer, args)

    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                ))
        else:
            info_json = run_vllm(requests_json, args.n, args.num_iters_warmup,
                                 EngineArgs.from_cli_args(args))
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.hf_max_batch_size, args.trust_remote_code)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    # file_name=args.model.rsplit("/")[-1]+"-tp"+str(args.tensor_parallel_size)+".txt"

    if is_multi_modal:
        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")

    with open(args.output_json, "w") as f:
        title = "bs_in_out"
        data_keys = info_json[list(info_json.keys())[0]].keys()
        keys_string = ','.join(data_keys)
        title = title + "," + keys_string
        f.write(title)
        f.write("\n")
        for key, value in info_json.items():
            values_as_strings = [str(value) for value in info_json[key].values()]
            values_string = ','.join(values_as_strings)
            key = key + "," + values_string
            f.writelines(key)
            f.write("\n")
        # json.dump(info_json, f, indent=4)

    # Output JSON results if specified
    # if args.output_json:
    #     results = {
    #         "elapsed_time": elapsed_time,
    #         "num_requests": len(requests),
    #         "total_num_tokens": total_num_tokens,
    #         "requests_per_second": len(requests) / elapsed_time,
    #         "tokens_per_second": total_num_tokens / elapsed_time,
    #     }
    #     with open(args.output_json, "w") as f:
    #         json.dump(results, f, indent=4)
if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
                        default="vllm")
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
                        help="Path to the dataset. The dataset is expected to "
                        "be a json in form of List[Dict[..., conversations: "
                        "List[Dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--input-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=1,
                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        nargs="*",
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument('--output-json',
                        type=str,
                        default=None,
                        help='Path to save the throughput results in JSON format.')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")

    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    if args.dataset is None:
        assert args.input_len is not None
        assert args.output_len is not None
    else:
        assert args.input_len is None
    if args.enable_lora:
        assert args.lora_path is not None

    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    main(args)
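
A minimal sketch of invoking the modified benchmark directly, outside run_benchmark.sh; the model path and sweep values below are illustrative assumptions, not values taken from this repository. Each --num-prompts value is combined with each zipped (--input-len, --output-len) pair, and one CSV-style row per combination is written to the --output-json file, keyed as num-prompts_input-len_output-len.

# Sketch only: standalone invocation with a hypothetical model path and sweep
python benchmark_throughput.py \
    --model /workspace/test/models/your-model \
    --tensor-parallel-size 1 \
    --num-prompts 1 4 \
    --input-len 128 1024 \
    --output-len 128 1024 \
    --dtype float16 \
    --trust-remote-code \
    --output-json sweep_results.txt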
3_env_check&batches_llm_inference/scripts/entrypoint.sh
0 → 100644
View file @
8d4db4be
#!/bin/bash

# Run the environment check
echo "==================== Starting system environment check ===================="
/workspace/scripts/run_envcheck.sh

# Run the performance benchmark
echo "==================== Starting performance benchmark ===================="
/workspace/scripts/run_benchmark.sh

echo "==================== All tests completed ===================="
\ No newline at end of file
3_env_check&batches_llm_inference/scripts/run_benchmark.sh
0 → 100644
View file @
8d4db4be
#!/bin/bash

export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_RPC_TIMEOUT=100000

# Path to the model configuration file
MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
# Results directory
RESULTS_DIR="/workspace/test/inference_outputs"

# Read the config file, skipping comments and blank lines
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip comment lines and blank lines
    if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
        continue
    fi

    # Parse the config line (semicolon-separated fields)
    IFS=';' read -ra CONFIG <<< "$line"
    model_name="${CONFIG[0]}"
    model_path="${CONFIG[1]}"
    tp="${CONFIG[2]}"
    batch="${CONFIG[3]//,/ }"              # Replace commas with spaces
    prompt_tokens="${CONFIG[4]//,/ }"
    completion_tokens="${CONFIG[5]//,/ }"
    dtype="${CONFIG[6]}"
    max_model_len="${CONFIG[7]}"
    gpu_memory_utilization="${CONFIG[8]}"

    echo "Starting benchmark for model: $model_name"
    echo "Model path: $model_path"
    echo "Parameters:"
    echo "  tensor_parallel_size: $tp"
    echo "  batch_sizes: $batch"
    echo "  prompt_tokens: $prompt_tokens"
    echo "  completion_tokens: $completion_tokens"
    echo "  dtype: $dtype"
    echo "  max_model_len: $max_model_len"
    echo "  gpu_memory_utilization: $gpu_memory_utilization"

    # Create a per-model results directory
    model_result_dir="${RESULTS_DIR}/${model_name}"
    mkdir -p "$model_result_dir"

    # Run the benchmark
    python /workspace/scripts/benchmark_throughput.py \
        --model "$model_path" \
        --tensor-parallel-size "$tp" \
        --num-prompts $batch \
        --input-len $prompt_tokens \
        --output-len $completion_tokens \
        --dtype "$dtype" \
        --trust-remote-code \
        --max-model-len "$max_model_len" \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
        2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"

    echo "Finished benchmark for model: $model_name"
    echo "Results saved to: $model_result_dir"
    echo "----------------------------------------"
done < "$MODELS_CONFIG"
\ No newline at end of file
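
Judging from the CONFIG[0]..CONFIG[8] indices parsed above, each non-comment line of model_to_test.cfg carries nine semicolon-separated fields, with commas inside a field expanding into a space-separated sweep. The line below is only a hypothetical illustration of that layout; the model name and path are not taken from the shipped config.

# name;path;tensor_parallel;batch_sizes;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# Hypothetical example line:
Qwen2-7B;/workspace/test/models/Qwen2-7B;1;1,4,8;128,1024;128,1024;float16;4096;0.9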
3_env_check&batches_llm_inference/scripts/run_envcheck.sh
0 → 100644
View file @
8d4db4be
#!/bin/bash
set -eo pipefail  # Strict error handling

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

echo "==================== Starting system environment check ===================="

# Basic check helper
run_test() {
    local name=$1
    shift
    echo "[RUN] $name"
    "$@" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name check failed" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

run_pipe_test() {
    local name=$1
    local cmd=$2
    echo "[RUN] $name"
    bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name check failed" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Basic system checks
run_test rocm_bandwidth_test rocm-bandwidth-test
run_test hy_smi hy-smi
run_test hy_smi_config hy-smi -c
run_test pip_list pip list
run_test cpu_info lscpu
run_test cpu_cores nproc
run_test memory_usage free -h
run_test disk_usage df -h
run_test hardware_info lshw -short || true
run_test network_interfaces ip a
run_test ibstat ibstat
run_test ibdev2netdev ibdev2netdev
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
cd /workspace/test/env_check_outputs
if command -v git &>/dev/null && command -v make &>/dev/null; then
    if [ ! -d rccl-tests ]; then
        git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
    fi
    cd rccl-tests || exit 1
    source /opt/dtk/env.sh
    if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
        CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
        ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
        ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
    else
        echo "[ERROR] RCCL build failed" | tee "$log_dir/rccl_build_fail.log"
    fi
    cd ..
else
    echo "[WARN] git or make not found, skipping RCCL test" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [ ! -d dcu_env_check ]; then
    git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
        echo "[ERROR] Failed to clone the DCU environment check repository" | tee "$log_dir/dcu_clone_fail.log"
        exit 1
    }
fi
cd dcu_env_check && {
    bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"
    cd ..
} || {
    echo "[ERROR] DCU environment check failed" | tee "$log_dir/dcu_check_fail.log"
    exit 1
}

echo "==================== Check completed ===================="
echo "All logs saved to: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
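
For reference, a single run_test call in the script above expands roughly as sketched below, using the disk_usage check the script already performs; the log file name is derived from the check name, and this is only an illustration of the helper's behaviour, not an additional check.

# Roughly what `run_test disk_usage df -h` does:
echo "[RUN] disk_usage"
df -h 2>&1 | tee /workspace/test/env_check_outputs/disk_usage.log || {
    echo "[WARN] disk_usage check failed" | tee -a /workspace/test/env_check_outputs/disk_usage.log
}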
3_env_check&batches_llm_inference/start.sh
0 → 100644
View file @
8d4db4be
docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v /public/models:/workspace/test/models/ \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
\ No newline at end of file
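
One possible way to launch the whole pipeline, assuming the command is run from this example's directory on a DCU host; pre-creating the two host output directories only makes the bind-mount targets explicit, and the log file name is an arbitrary choice.

# Sketch: prepare host output directories, then build and run the container
mkdir -p outputs/env_check_outputs outputs/inference_outputs
bash start.sh 2>&1 | tee start_$(date +%Y%m%d_%H%M%S).log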