jerrrrry / easystart_v0.1 / Commits

Commit 8d4db4be, authored Jun 05, 2025 by jerrrrry
Initial commit
Showing 19 changed files with 745 additions and 0 deletions (+745, -0).
1_env_check/Dockerfile  (+36, -0)
1_env_check/scripts/run_envcheck.sh  (+89, -0)
1_env_check/start.sh  (+18, -0)
2_env_check&model_download&llm_inference/Dockerfile  (+37, -0)
2_env_check&model_download&llm_inference/configs/download-list.cfg  (+4, -0)
2_env_check&model_download&llm_inference/configs/model_to_test.cfg  (+7, -0)
2_env_check&model_download&llm_inference/scripts/benchmark_throughput.py  (+0, -0)
2_env_check&model_download&llm_inference/scripts/download_model.sh  (+91, -0)
2_env_check&model_download&llm_inference/scripts/entrypoint.sh  (+16, -0)
2_env_check&model_download&llm_inference/scripts/run_benchmark.sh  (+86, -0)
2_env_check&model_download&llm_inference/scripts/run_envcheck.sh  (+89, -0)
2_env_check&model_download&llm_inference/start.sh  (+20, -0)
3_env_check&batches_llm_inference/Dockerfile  (+36, -0)
3_env_check&batches_llm_inference/configs/model_to_test.cfg  (+10, -0)
3_env_check&batches_llm_inference/scripts/benchmark_throughput.py  (+0, -0)
3_env_check&batches_llm_inference/scripts/entrypoint.sh  (+11, -0)
3_env_check&batches_llm_inference/scripts/run_benchmark.sh  (+86, -0)
3_env_check&batches_llm_inference/scripts/run_envcheck.sh  (+89, -0)
3_env_check&batches_llm_inference/start.sh  (+20, -0)
1_env_check/Dockerfile (new file, mode 100644)

# Use the official SourceFind base image
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/test/env_check_outputs

# Copy the script and set permissions
COPY ./scripts/run_envcheck.sh /workspace/scripts/

# Verify that the script is present and runnable
RUN ls -l /workspace/scripts/ && \
    file /workspace/scripts/run_envcheck.sh && \
    head -n 1 /workspace/scripts/run_envcheck.sh  # check the shebang

# Set the working directory (the script directory itself)
WORKDIR /workspace/scripts/

# Run the script directly (no cd needed)
CMD bash -c "bash run_envcheck.sh"
1_env_check/scripts/run_envcheck.sh (new file, mode 100644)

#!/bin/bash
set -eo pipefail  # strict error handling

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"

echo "==================== Starting system environment check ===================="

# Basic check helper: first argument is the log name, the rest is the command
run_test() {
    local name=$1
    shift
    echo "[RUN] $name"
    "$@" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name check failed" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Same as run_test, but the command is a quoted pipeline run through bash -c
run_pipe_test() {
    local name=$1
    local cmd=$2
    echo "[RUN] $name"
    bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name check failed" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Basic system checks
run_test rocm_bandwidth_test rocm-bandwidth-test
run_test hy_smi hy-smi
run_test hy_smi_config hy-smi -c
run_test pip_list pip list
run_test cpu_info lscpu
run_test cpu_cores nproc
run_test memory_usage free -h
run_test disk_usage df -h
run_test hardware_info lshw -short || true
run_test network_interfaces ip a
run_test ibstat ibstat
run_test ibdev2netdev ibdev2netdev
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct"
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
cd /workspace/test/env_check_outputs
if command -v git &>/dev/null && command -v make &>/dev/null; then
    if [ ! -d rccl-tests ]; then
        git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
    fi
    cd rccl-tests || exit 1
    source /opt/dtk/env.sh
    if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
        CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
        ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 2>&1 | tee "$log_dir/all_reduce_perf_8.log"
        ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 2>&1 | tee "$log_dir/all_reduce_perf_4.log"
    else
        echo "[ERROR] RCCL build failed" | tee "$log_dir/rccl_build_fail.log"
    fi
    cd ..
else
    echo "[WARN] git or make missing, skipping the RCCL test" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [ ! -d dcu_env_check ]; then
    git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
        echo "[ERROR] Failed to clone the DCU environment check code" | tee "$log_dir/dcu_clone_fail.log"
        exit 1
    }
fi
cd dcu_env_check && {
    bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"
    cd ..
} || {
    echo "[ERROR] DCU environment check failed" | tee "$log_dir/dcu_check_fail.log"
    exit 1
}

echo "==================== Check complete ===================="
echo "All logs saved to: $log_dir"
ls -lh "$log_dir"
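run_test takes a log name followed by the command to run, while run_pipe_test takes a log name and a quoted pipeline. As an illustration only (not part of the commit), further checks could be added with the same helpers:

# hypothetical extra checks reusing the helpers above
run_test kernel_version uname -r
run_pipe_test numa_topology "lscpu | grep -i numa"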
1_env_check/start.sh (new file, mode 100644)

docker build -t env_check . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    env_check
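Because the run bind-mounts /workspace/test/env_check_outputs to $PWD/outputs/env_check_outputs, the per-check logs remain on the host after the container exits. A quick host-side check (illustrative only, not part of the commit) might be:

# inspect the collected logs and list any that recorded a failed check
ls -lh outputs/env_check_outputs
grep -l "WARN" outputs/env_check_outputs/*.log 2>/dev/null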
2_env_check&model_download&llm_inference/Dockerfile (new file, mode 100644)

# Use the official SourceFind base image
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/configs && \
    mkdir -p /workspace/test/env_check_outputs && \
    mkdir -p /workspace/test/inference_outputs && \
    mkdir -p /workspace/test/models

# Copy scripts and configs
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*

# Set the working directory (the script directory itself)
WORKDIR /workspace/scripts/

# Run the entrypoint directly (no cd needed)
CMD bash -c "bash entrypoint.sh"
2_env_check&model_download&llm_inference/configs/download-list.cfg (new file, mode 100644)

# Format: model ID;local save path
# The model ID is the ID assigned on the ModelScope website
Qwen/Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B
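Each non-comment line is one model to fetch. As an illustration only (not part of the commit), a further entry would follow the same id;path pattern; the example below reuses the Qwen3-1.7B model path that appears in 3_env_check&batches_llm_inference/configs/model_to_test.cfg:

# hypothetical additional entry, same format
Qwen/Qwen3-1.7B;/workspace/test/models/Qwen/Qwen3-1.7B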
2_env_check&model_download&llm_inference/configs/model_to_test.cfg (new file, mode 100644)

# Format:
# model_name;model_path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# The model path is the path inside the Docker container
# Multiple values are separated by commas
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
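The header allows multiple comma-separated values. As a sketch only (not part of the commit), a multi-value row would look like the line below; run_benchmark.sh converts the commas to spaces before passing the values on, but whether benchmark_throughput.py accepts several space-separated values depends on its argument parsing, which is collapsed in this diff:

# hypothetical row with several batch sizes
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1,4,8;512;512;float16;32768;0.95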
2_env_check&model_download&llm_inference/scripts/benchmark_throughput.py (new file, mode 100644; diff collapsed in this view)
2_env_check&model_download&llm_inference/scripts/download_model.sh (new file, mode 100644)

#!/bin/bash
# ModelScope CLI batch download script
# Usage: ./ms_download.sh -f model_list.cfg [-F force re-download]

pip install modelscope -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

# Parse arguments
CONFIG_FILE=""
FORCE_DOWNLOAD=false
MODELSCOPE_CMD="modelscope download"

while getopts "f:F" opt; do
    case $opt in
        f) CONFIG_FILE="$OPTARG" ;;
        F) FORCE_DOWNLOAD=true ;;
        *) echo "Usage: $0 -f config.cfg [-F]" >&2
           exit 1
    esac
done

# Check the config file
if [ ! -f "$CONFIG_FILE" ]; then
    echo "Error: Config file $CONFIG_FILE not found!" >&2
    exit 1
fi

# Check that the modelscope CLI is installed
if ! command -v modelscope &> /dev/null; then
    echo "Error: modelscope CLI not installed. Please install with: pip install modelscope" >&2
    exit 1
fi

# Read the config file
TOTAL=0
SUCCESS=0
FAILED=0

echo "=== Starting batch download ==="

while IFS=';' read -r model_id local_dir || [[ -n "$model_id" ]]; do
    # Skip blank lines and comments
    [[ -z "$model_id" || "$model_id" =~ ^# ]] && continue
    ((TOTAL++))

    # Trim whitespace from both fields
    model_id=$(echo "$model_id" | xargs)
    local_dir=$(echo "$local_dir" | xargs)

    echo -e "\n[Progress] $TOTAL. Downloading $model_id"
    echo "[Location] $local_dir"

    # Skip if the target directory already exists
    if [ "$FORCE_DOWNLOAD" = false ] && [ -d "$local_dir" ]; then
        echo "[Status] Skipped (already exists)"
        ((SUCCESS++))
        continue
    fi

    # Create the target directory
    mkdir -p "$local_dir" || {
        echo "[Error] Failed to create directory $local_dir" >&2
        ((FAILED++))
        continue
    }

    # Run the download command
    if $MODELSCOPE_CMD --model "$model_id" --local_dir "$local_dir"; then
        echo "[Status] Download successful"
        ((SUCCESS++))
    else
        echo "[Error] Download failed" >&2
        ((FAILED++))
        # Remove the empty directory so nothing is left behind
        rmdir "$local_dir" 2>/dev/null
    fi
done < "$CONFIG_FILE"

# Summary
echo -e "\n=== Download summary ==="
echo "Total:   $TOTAL"
echo "Success: $SUCCESS"
echo "Failed:  $FAILED"

# Exit status
if [ "$FAILED" -gt 0 ]; then
    exit 1
else
    exit 0
fi
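For reference, the usage line at the top of the script corresponds to invocations like the following (the config path is the one entrypoint.sh passes in; -F forces a re-download even if the target directory already exists):

bash /workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg
bash /workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg -F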
2_env_check&model_download&llm_inference/scripts/entrypoint.sh (new file, mode 100644)

#!/bin/bash
# Run the environment check
echo "==================== Starting system environment check ===================="
/workspace/scripts/run_envcheck.sh

# Download the models
echo "==================== Starting model download ===================="
/workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg

# Run the performance test
echo "==================== Starting performance test ===================="
/workspace/scripts/run_benchmark.sh

echo "==================== All tests complete ===================="
2_env_check&model_download&llm_inference/scripts/run_benchmark.sh (new file, mode 100644)

#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/:$LD_LIBRARY_PATH
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_RPC_TIMEOUT=100000

# Model config file path
MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
# Results directory
RESULTS_DIR="/workspace/test/inference_outputs"

# Read the config file, skipping comments and blank lines
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip comment lines and blank lines
    if [[ "$line" =~ ^# ]] || [[ -z "$line" ]]; then
        continue
    fi

    # Parse the config line
    IFS=';' read -ra CONFIG <<< "$line"
    model_name="${CONFIG[0]}"
    model_path="${CONFIG[1]}"
    tp="${CONFIG[2]}"
    batch="${CONFIG[3]//,/ }"              # replace commas with spaces
    prompt_tokens="${CONFIG[4]//,/ }"
    completion_tokens="${CONFIG[5]//,/ }"
    dtype="${CONFIG[6]}"
    max_model_len="${CONFIG[7]}"
    gpu_memory_utilization="${CONFIG[8]}"

    echo "Starting test for model: $model_name"
    echo "Model path: $model_path"
    echo "Parameters:"
    echo "  tensor_parallel_size: $tp"
    echo "  batch_sizes: $batch"
    echo "  prompt_tokens: $prompt_tokens"
    echo "  completion_tokens: $completion_tokens"
    echo "  dtype: $dtype"
    echo "  max_model_len: $max_model_len"
    echo "  gpu_memory_utilization: $gpu_memory_utilization"

    # Create a per-model results directory
    model_result_dir="${RESULTS_DIR}/${model_name}"
    mkdir -p "$model_result_dir"

    # Run the benchmark
    python /workspace/scripts/benchmark_throughput.py \
        --model "$model_path" \
        --tensor-parallel-size "$tp" \
        --num-prompts $batch \
        --input-len $prompt_tokens \
        --output-len $completion_tokens \
        --dtype "$dtype" \
        --trust-remote-code \
        --max-model-len "$max_model_len" \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
        2>&1 | tee "${model_result_dir}/${model_name}_tp${tp}.log"

    echo "Finished testing model: $model_name"
    echo "Results saved in: $model_result_dir"
    echo "----------------------------------------"
done < "$MODELS_CONFIG"
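To make the field mapping concrete, here is a small standalone illustration (not part of the commit) of how the loop splits one row of model_to_test.cfg:

# illustrative parsing of the sample config row
line='Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95'
IFS=';' read -ra CONFIG <<< "$line"
echo "${CONFIG[0]}"        # Qwen3-0.6B  -> model_name
echo "${CONFIG[2]}"        # 1           -> tensor_parallel_size
echo "${CONFIG[3]//,/ }"   # 1           (a value like "1,4,8" would become "1 4 8")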
2_env_check&model_download&llm_inference/scripts/run_envcheck.sh (new file, mode 100644)

(Identical to 1_env_check/scripts/run_envcheck.sh above.)
2_env_check&model_download&llm_inference/start.sh (new file, mode 100644)

docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v $PWD/outputs/models:/workspace/test/models/ \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
3_env_check&batches_llm_inference/Dockerfile (new file, mode 100644)

# Use the official SourceFind base image
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install basic tools
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        iproute2 \
        dmidecode \
        ipmitool \
        git \
        curl \
        jq \
        lshw \
        iputils-ping \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory structure
RUN mkdir -p /workspace/scripts && \
    mkdir -p /workspace/configs && \
    mkdir -p /workspace/test/env_check_outputs && \
    mkdir -p /workspace/test/inference_outputs

# Copy scripts and configs
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/
RUN chmod +x /workspace/scripts/*
RUN chmod +x /workspace/configs/*

# Set the working directory (the script directory itself)
WORKDIR /workspace/scripts/

# Run the entrypoint directly (no cd needed)
CMD bash -c "bash entrypoint.sh"
3_env_check&batches_llm_inference/configs/model_to_test.cfg (new file, mode 100644)

# Format:
# model_name;model_path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# The model path is the path inside the Docker container
# Multiple values are separated by commas
Qwen3-4B;/workspace/test/models/Qwen/Qwen3-4B;1;1;512;512;float16;32768;0.95
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
Qwen3-1.7B;/workspace/test/models/Qwen/Qwen3-1.7B;1;1;512;512;float16;32768;0.95
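These container paths assume the matching models already exist on the host: the start.sh in this directory mounts /public/models into the container at /workspace/test/models. A host-side sanity check before launching (illustrative only, not part of the commit) could be:

# confirm the three model directories are present under the mounted path
ls -d /public/models/Qwen/Qwen3-4B /public/models/Qwen/Qwen3-0.6B /public/models/Qwen/Qwen3-1.7B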
3_env_check&batches_llm_inference/scripts/benchmark_throughput.py (new file, mode 100644; diff collapsed in this view)
3_env_check&batches_llm_inference/scripts/entrypoint.sh (new file, mode 100644)

#!/bin/bash
# Run the environment check
echo "==================== Starting system environment check ===================="
/workspace/scripts/run_envcheck.sh

# Run the performance test
echo "==================== Starting performance test ===================="
/workspace/scripts/run_benchmark.sh

echo "==================== All tests complete ===================="
3_env_check&batches_llm_inference/scripts/run_benchmark.sh (new file, mode 100644)

(Identical to 2_env_check&model_download&llm_inference/scripts/run_benchmark.sh above.)
3_env_check&batches_llm_inference/scripts/run_envcheck.sh (new file, mode 100644)

(Identical to 1_env_check/scripts/run_envcheck.sh above.)
3_env_check&batches_llm_inference/start.sh (new file, mode 100644)

docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v $PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/ \
    -v /public/models:/workspace/test/models/ \
    -v $PWD/outputs/inference_outputs:/workspace/test/inference_outputs/ \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1