Commit 8d4db4be authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
# Environment-check image: the vendor vLLM/DTK base image plus the host
# inspection tools that run_envcheck.sh invokes.
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install the inspection tools (sorted alphabetically for easy diffing).
# `file` is included because the verification step below runs it.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        dmidecode \
        file \
        git \
        ipmitool \
        iproute2 \
        iputils-ping \
        jq \
        lshw \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the script directory and the directory the script writes its logs to.
RUN mkdir -p /workspace/scripts /workspace/test/env_check_outputs

# Copy the environment-check script into the image.
COPY ./scripts/run_envcheck.sh /workspace/scripts/

# Build-time sanity check: list the script, print its detected file type and
# its first line (the shebang).
RUN ls -l /workspace/scripts/ && \
    file /workspace/scripts/run_envcheck.sh && \
    head -n 1 /workspace/scripts/run_envcheck.sh

# Use the script directory as working directory so the command needs no cd.
WORKDIR /workspace/scripts/

# Exec form: bash is started directly instead of through nested
# `sh -c` / `bash -c` wrapper shells.
CMD ["bash", "run_envcheck.sh"]
\ No newline at end of file
#!/bin/bash
# Environment check: records host, device and network information, then builds
# and runs rccl-tests and the dcu_env_check suite. The output of every step is
# mirrored to the console and to one log file per step under $log_dir.
#
# Individual probes are best-effort: a failing probe is logged as [WARN] and
# the script moves on. Failing to fetch rccl-tests or to fetch/run
# dcu_env_check ends the script with status 1.
set -eo pipefail # strict mode: stop on unhandled errors; a pipeline fails if any stage fails

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"
echo "==================== 开始系统环境检查 ===================="

# run_test NAME CMD [ARGS...]
#   Runs CMD with its arguments, sending stdout+stderr to the console and to
#   $log_dir/NAME.log. On failure a warning is appended to that log and the
#   function returns 1; otherwise it returns 0.
run_test() {
    local name=$1
    shift
    echo "[RUN] $name"
    "$@" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# run_pipe_test NAME 'COMMAND LINE'
#   Same as run_test, but the command is a single string evaluated by
#   `bash -c`, so it may contain pipes and other shell syntax.
run_pipe_test() {
    local name=$1
    local cmd=$2
    echo "[RUN] $name"
    bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Basic system probes. `|| true` keeps `set -e` from ending the whole run when
# one probe fails (missing tool, grep without a match, ...); the failure is
# already recorded by run_test / run_pipe_test.
run_test rocm_bandwidth_test rocm-bandwidth-test || true
run_test hy_smi hy-smi || true
run_test hy_smi_config hy-smi -c || true
run_test pip_list pip list || true
run_test cpu_info lscpu || true
run_test cpu_cores nproc || true
run_test memory_usage free -h || true
run_test disk_usage df -h || true
run_test hardware_info lshw -short || true
run_test network_interfaces ip a || true
run_test ibstat ibstat || true
run_test ibdev2netdev ibdev2netdev || true
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct" || true
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
# The test sources are cloned next to the logs.
cd "$log_dir"
if command -v git &>/dev/null && command -v make &>/dev/null; then
    if [ ! -d rccl-tests ]; then
        git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
    fi
    cd rccl-tests || exit 1
    source /opt/dtk/env.sh
    if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
        CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
        # Best-effort as well: a failing 8-device run must not prevent the
        # 4-device run or the remaining checks.
        run_test all_reduce_perf_8 ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 || true
        run_test all_reduce_perf_4 ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 || true
    else
        echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
    fi
    cd ..
else
    echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [ ! -d dcu_env_check ]; then
    git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
        echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
        exit 1
    }
fi
# Explicit if/else instead of `a && { ... } || { ... }`, so that a failure of
# system_check.sh itself (not only of the cd) is reported.
if cd dcu_env_check && bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"; then
    cd ..
else
    echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
    exit 1
fi

echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
# Build the env_check image from the current directory and, if the build
# succeeds, run it as root in privileged mode with:
#   - the host's hyhal directories mounted read-only,
#   - ./outputs/env_check_outputs mounted as the container's log directory,
#   - /dev/kfd, /dev/mkfd and /dev/dri passed through.
# The image name is the last argument, so the image's default command runs.
docker build -t env_check . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v "$PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/" \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    env_check
\ No newline at end of file
# Test image: the vendor vLLM/DTK base image plus host inspection tools, the
# test scripts and their config files. The default command runs entrypoint.sh.
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install the inspection tools (sorted alphabetically for easy diffing).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        dmidecode \
        git \
        ipmitool \
        iproute2 \
        iputils-ping \
        jq \
        lshw \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory layout: scripts, configs, and the output/model
# directories used by the scripts.
RUN mkdir -p \
        /workspace/scripts \
        /workspace/configs \
        /workspace/test/env_check_outputs \
        /workspace/test/inference_outputs \
        /workspace/test/models

# Copy the scripts and the config files.
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/

# Only the scripts need the execute bit; the config files are read as data.
RUN chmod +x /workspace/scripts/*

# Use the script directory as working directory so the command needs no cd.
WORKDIR /workspace/scripts/

# Exec form: bash is started directly instead of through nested
# `sh -c` / `bash -c` wrapper shells.
CMD ["bash", "entrypoint.sh"]
\ No newline at end of file
# Format: model ID;local save path
# The model ID is the ID given on the ModelScope website
Qwen/Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B
\ No newline at end of file
# Format:
# 模型名称;模型路径;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# (model name;model path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization)
# The model path is the path inside the Docker container
# Separate multiple values with commas
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
#!/bin/bash
# Batch download of ModelScope models through the modelscope CLI.
#
# Usage: ./download_model.sh -f model-list.cfg [-F]
#   -f FILE  list file with one "model_id;local_dir" entry per line;
#            empty lines and lines starting with '#' are ignored
#   -F       download again even if the target directory already has content
#
# Exit status: 0 when every entry was downloaded or skipped, 1 otherwise.

# Print the usage line to stderr.
usage() {
    echo "Usage: $0 -f config.cfg [-F]" >&2
}

# trim STRING: print STRING without leading/trailing whitespace
# (this also drops a trailing carriage return from CRLF files).
trim() {
    local s=$1
    s="${s#"${s%%[![:space:]]*}"}"
    s="${s%"${s##*[![:space:]]}"}"
    printf '%s' "$s"
}

# Argument parsing
CONFIG_FILE=""
FORCE_DOWNLOAD=false
# Kept as an array so the command is expanded without relying on word splitting.
MODELSCOPE_CMD=(modelscope download)
while getopts "f:F" opt; do
    case $opt in
        f) CONFIG_FILE="$OPTARG" ;;
        F) FORCE_DOWNLOAD=true ;;
        *) usage
           exit 1 ;;
    esac
done

# Validate the config file before doing any network work.
if [ -z "$CONFIG_FILE" ]; then
    usage
    exit 1
fi
if [ ! -f "$CONFIG_FILE" ]; then
    echo "Error: Config file $CONFIG_FILE not found!" >&2
    exit 1
fi

# Install the modelscope CLI only when it is not available yet.
if ! command -v modelscope &> /dev/null; then
    pip install modelscope -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
fi
if ! command -v modelscope &> /dev/null; then
    echo "Error: modelscope CLI not installed. Please install with: pip install modelscope" >&2
    exit 1
fi

# Process the list
TOTAL=0
SUCCESS=0
FAILED=0
echo "=== Starting batch download ==="
# `|| [[ -n "$model_id" ]]` keeps a last line that has no trailing newline.
while IFS=';' read -r model_id local_dir || [[ -n "$model_id" ]]; do
    # Trim first so indented comments and padded fields are handled.
    model_id=$(trim "$model_id")
    local_dir=$(trim "$local_dir")

    # Skip empty lines and comments
    [[ -z "$model_id" || "$model_id" == \#* ]] && continue
    TOTAL=$((TOTAL + 1))

    echo -e "\n[Progress] $TOTAL. Downloading $model_id"
    echo "[Location] $local_dir"

    if [ -z "$local_dir" ]; then
        echo "[Error] No local directory given for $model_id" >&2
        FAILED=$((FAILED + 1))
        continue
    fi

    # Skip only when the directory exists AND has content; an empty
    # directory does not hold a downloaded model.
    if [ "$FORCE_DOWNLOAD" = false ] && [ -d "$local_dir" ] \
        && [ -n "$(ls -A "$local_dir" 2>/dev/null)" ]; then
        echo "[Status] Skipped (already exists)"
        SUCCESS=$((SUCCESS + 1))
        continue
    fi

    # Create the target directory
    mkdir -p "$local_dir" || {
        echo "[Error] Failed to create directory $local_dir" >&2
        FAILED=$((FAILED + 1))
        continue
    }

    # Run the download. stdin is redirected from /dev/null so the child
    # process cannot consume the remaining lines of the list file.
    if "${MODELSCOPE_CMD[@]}" --model "$model_id" --local_dir "$local_dir" < /dev/null; then
        echo "[Status] Download successful"
        SUCCESS=$((SUCCESS + 1))
    else
        echo "[Error] Download failed" >&2
        FAILED=$((FAILED + 1))
        # Remove the directory if it is still empty so no stub is left behind
        rmdir "$local_dir" 2>/dev/null
    fi
done < "$CONFIG_FILE"

# Summary
echo -e "\n=== Download summary ==="
echo "Total: $TOTAL"
echo "Success: $SUCCESS"
echo "Failed: $FAILED"

# Exit status: 1 if any entry failed, 0 otherwise
exit $(( FAILED > 0 ))
\ No newline at end of file
#!/bin/bash
# Container entry point: runs the environment check, the model download and
# the benchmark, in that order. Every stage runs even if an earlier one
# failed; the exit status is 1 if any stage failed and 0 otherwise.

failed_stages=0

# run_stage TITLE SCRIPT [ARGS...]
#   Prints the stage banner, runs SCRIPT with bash and records a failure.
run_stage() {
    local title=$1
    shift
    echo "==================== 开始${title} ===================="
    bash "$@" || {
        echo "[ERROR] ${title}失败 (exit $?)" >&2
        failed_stages=$((failed_stages + 1))
    }
}

# Environment check
run_stage "系统环境检查" /workspace/scripts/run_envcheck.sh
# Model download
run_stage "模型下载" /workspace/scripts/download_model.sh -f /workspace/configs/download-list.cfg
# Benchmark
run_stage "性能测试" /workspace/scripts/run_benchmark.sh

echo "==================== 所有测试完成 ===================="
exit $(( failed_stages > 0 ))
\ No newline at end of file
# Environment variables exported for the benchmark run.
# Each variable is exported exactly once.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
# Prepend the torch library directory. The ':' separator is added only when
# LD_LIBRARY_PATH already has a value, so no empty entry ends up in the list.
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_RPC_TIMEOUT=100000
#!/bin/bash
# Runs benchmark_throughput.py once for every entry of the model config file
# and stores each run's result file and console log under
# RESULTS_DIR/<model name>/.
#
# Config line format (fields separated by ';', lists inside a field by ','):
#   name;path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
#
# Exit status: 0 when every entry ran successfully, 1 otherwise.

# Path of the model config file
MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
# Directory that receives the results
RESULTS_DIR="/workspace/test/inference_outputs"

if [ ! -f "$MODELS_CONFIG" ]; then
    echo "Error: Config file $MODELS_CONFIG not found!" >&2
    exit 1
fi

comment_re='^[[:space:]]*#'
failed=0

# Read the config line by line; `|| [[ -n "$line" ]]` keeps a last line that
# has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # tolerate CRLF line endings

    # Skip comment lines and blank lines
    if [[ "$line" =~ $comment_re ]] || [[ -z "${line//[[:space:]]/}" ]]; then
        continue
    fi

    # Split the line into its ';'-separated fields
    IFS=';' read -ra CONFIG <<< "$line"
    if [ "${#CONFIG[@]}" -lt 9 ]; then
        echo "[WARN] 配置行字段不足,已跳过: $line" >&2
        failed=$((failed + 1))
        continue
    fi

    model_name="${CONFIG[0]}"
    model_path="${CONFIG[1]}"
    tp="${CONFIG[2]}"
    # Comma-separated lists become arrays, one element per value
    read -ra batch <<< "${CONFIG[3]//,/ }"
    read -ra prompt_tokens <<< "${CONFIG[4]//,/ }"
    read -ra completion_tokens <<< "${CONFIG[5]//,/ }"
    dtype="${CONFIG[6]}"
    max_model_len="${CONFIG[7]}"
    gpu_memory_utilization="${CONFIG[8]}"

    echo "开始测试模型: $model_name"
    echo "模型路径: $model_path"
    echo "参数配置:"
    echo " tensor_parallel_size: $tp"
    echo " batch_sizes: ${batch[*]}"
    echo " prompt_tokens: ${prompt_tokens[*]}"
    echo " completion_tokens: ${completion_tokens[*]}"
    echo " dtype: $dtype"
    echo " max_model_len: $max_model_len"
    echo " gpu_memory_utilization: $gpu_memory_utilization"

    # Per-model result directory
    model_result_dir="${RESULTS_DIR}/${model_name}"
    mkdir -p "$model_result_dir"

    # Run the benchmark. stdin comes from /dev/null so the child process
    # cannot consume the remaining lines of the config file.
    python /workspace/scripts/benchmark_throughput.py \
        --model "$model_path" \
        --tensor-parallel-size "$tp" \
        --num-prompts "${batch[@]}" \
        --input-len "${prompt_tokens[@]}" \
        --output-len "${completion_tokens[@]}" \
        --dtype "$dtype" \
        --trust-remote-code \
        --max-model-len "$max_model_len" \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
        2>&1 < /dev/null | tee "${model_result_dir}/${model_name}_tp${tp}.log"
    status=${PIPESTATUS[0]} # exit status of python, not of tee

    if [ "$status" -ne 0 ]; then
        echo "[ERROR] 模型测试失败: $model_name (exit $status)" >&2
        failed=$((failed + 1))
    else
        echo "完成测试模型: $model_name"
    fi
    echo "结果保存在: $model_result_dir"
    echo "----------------------------------------"
done < "$MODELS_CONFIG"

exit $(( failed > 0 ))
\ No newline at end of file
#!/bin/bash
# Environment check: records host, device and network information, then builds
# and runs rccl-tests and the dcu_env_check suite. The output of every step is
# mirrored to the console and to one log file per step under $log_dir.
#
# Individual probes are best-effort: a failing probe is logged as [WARN] and
# the script moves on. Failing to fetch rccl-tests or to fetch/run
# dcu_env_check ends the script with status 1.
set -eo pipefail # strict mode: stop on unhandled errors; a pipeline fails if any stage fails

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"
echo "==================== 开始系统环境检查 ===================="

# run_test NAME CMD [ARGS...]
#   Runs CMD with its arguments, sending stdout+stderr to the console and to
#   $log_dir/NAME.log. On failure a warning is appended to that log and the
#   function returns 1; otherwise it returns 0.
run_test() {
    local name=$1
    shift
    echo "[RUN] $name"
    "$@" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# run_pipe_test NAME 'COMMAND LINE'
#   Same as run_test, but the command is a single string evaluated by
#   `bash -c`, so it may contain pipes and other shell syntax.
run_pipe_test() {
    local name=$1
    local cmd=$2
    echo "[RUN] $name"
    bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Basic system probes. `|| true` keeps `set -e` from ending the whole run when
# one probe fails (missing tool, grep without a match, ...); the failure is
# already recorded by run_test / run_pipe_test.
run_test rocm_bandwidth_test rocm-bandwidth-test || true
run_test hy_smi hy-smi || true
run_test hy_smi_config hy-smi -c || true
run_test pip_list pip list || true
run_test cpu_info lscpu || true
run_test cpu_cores nproc || true
run_test memory_usage free -h || true
run_test disk_usage df -h || true
run_test hardware_info lshw -short || true
run_test network_interfaces ip a || true
run_test ibstat ibstat || true
run_test ibdev2netdev ibdev2netdev || true
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct" || true
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
# The test sources are cloned next to the logs.
cd "$log_dir"
if command -v git &>/dev/null && command -v make &>/dev/null; then
    if [ ! -d rccl-tests ]; then
        git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
    fi
    cd rccl-tests || exit 1
    source /opt/dtk/env.sh
    if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
        CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
        # Best-effort as well: a failing 8-device run must not prevent the
        # 4-device run or the remaining checks.
        run_test all_reduce_perf_8 ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 || true
        run_test all_reduce_perf_4 ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 || true
    else
        echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
    fi
    cd ..
else
    echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [ ! -d dcu_env_check ]; then
    git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
        echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
        exit 1
    }
fi
# Explicit if/else instead of `a && { ... } || { ... }`, so that a failure of
# system_check.sh itself (not only of the cd) is reported.
if cd dcu_env_check && bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"; then
    cd ..
else
    echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
    exit 1
fi

echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
# Build the vllm-test1 image from the current directory and, if the build
# succeeds, run it as root in privileged mode with:
#   - the host's hyhal directories mounted read-only,
#   - ./outputs/env_check_outputs, ./outputs/models and
#     ./outputs/inference_outputs mounted into /workspace/test/,
#   - /dev/kfd, /dev/mkfd and /dev/dri passed through.
# The image name is the last argument, so the image's default command runs.
docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v "$PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/" \
    -v "$PWD/outputs/models:/workspace/test/models/" \
    -v "$PWD/outputs/inference_outputs:/workspace/test/inference_outputs/" \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
\ No newline at end of file
# Test image: the vendor vLLM/DTK base image plus host inspection tools, the
# test scripts and their config files. The default command runs entrypoint.sh.
FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2

# Install the inspection tools (sorted alphabetically for easy diffing).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        dmidecode \
        git \
        ipmitool \
        iproute2 \
        iputils-ping \
        jq \
        lshw \
        pciutils \
    && rm -rf /var/lib/apt/lists/*

# Create the directory layout: scripts, configs, and the output directories
# used by the scripts.
RUN mkdir -p \
        /workspace/scripts \
        /workspace/configs \
        /workspace/test/env_check_outputs \
        /workspace/test/inference_outputs

# Copy the scripts and the config files.
COPY ./scripts/* /workspace/scripts/
COPY ./configs/* /workspace/configs/

# Only the scripts need the execute bit; the config files are read as data.
RUN chmod +x /workspace/scripts/*

# Use the script directory as working directory so the command needs no cd.
WORKDIR /workspace/scripts/

# Exec form: bash is started directly instead of through nested
# `sh -c` / `bash -c` wrapper shells.
CMD ["bash", "entrypoint.sh"]
\ No newline at end of file
# Format:
# 模型名称;模型路径;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
# (model name;model path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization)
# The model path is the path inside the Docker container
# Separate multiple values with commas
Qwen3-4B;/workspace/test/models/Qwen/Qwen3-4B;1;1;512;512;float16;32768;0.95
Qwen3-0.6B;/workspace/test/models/Qwen/Qwen3-0.6B;1;1;512;512;float16;32768;0.95
Qwen3-1.7B;/workspace/test/models/Qwen/Qwen3-1.7B;1;1;512;512;float16;32768;0.95
\ No newline at end of file
#!/bin/bash
# Container entry point: runs the environment check and then the benchmark.
# Both stages run even if the first one failed; the exit status is 1 if any
# stage failed and 0 otherwise.

failed_stages=0

# run_stage TITLE SCRIPT [ARGS...]
#   Prints the stage banner, runs SCRIPT with bash and records a failure.
run_stage() {
    local title=$1
    shift
    echo "==================== 开始${title} ===================="
    bash "$@" || {
        echo "[ERROR] ${title}失败 (exit $?)" >&2
        failed_stages=$((failed_stages + 1))
    }
}

# Environment check
run_stage "系统环境检查" /workspace/scripts/run_envcheck.sh
# Benchmark
run_stage "性能测试" /workspace/scripts/run_benchmark.sh

echo "==================== 所有测试完成 ===================="
exit $(( failed_stages > 0 ))
\ No newline at end of file
# Environment variables exported for the benchmark run.
# Each variable is exported exactly once.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCBLAS_COMPUTETYPE_FP16R=0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_LAUNCH_MODE=GROUP
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MAX_NCHANNELS=16
export NCCL_MIN_NCHANNELS=16
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_P2P_LEVEL=SYS
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export SENDRECV_STREAM_WITH_COMPUTE=1
export ALLREDUCE_STREAM_WITH_COMPUTE=1
# Prepend the torch library directory. The ':' separator is added only when
# LD_LIBRARY_PATH already has a value, so no empty entry ends up in the list.
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/site-packages/torch/lib/${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export VLLM_PCIE_USE_CUSTOM_ALLREDUCE=1
export VLLM_RPC_TIMEOUT=100000
#!/bin/bash
# Runs benchmark_throughput.py once for every entry of the model config file
# and stores each run's result file and console log under
# RESULTS_DIR/<model name>/.
#
# Config line format (fields separated by ';', lists inside a field by ','):
#   name;path;tp;batch;prompt_tokens;completion_tokens;dtype;max_model_len;gpu_memory_utilization
#
# Exit status: 0 when every entry ran successfully, 1 otherwise.

# Path of the model config file
MODELS_CONFIG="/workspace/configs/model_to_test.cfg"
# Directory that receives the results
RESULTS_DIR="/workspace/test/inference_outputs"

if [ ! -f "$MODELS_CONFIG" ]; then
    echo "Error: Config file $MODELS_CONFIG not found!" >&2
    exit 1
fi

comment_re='^[[:space:]]*#'
failed=0

# Read the config line by line; `|| [[ -n "$line" ]]` keeps a last line that
# has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # tolerate CRLF line endings

    # Skip comment lines and blank lines
    if [[ "$line" =~ $comment_re ]] || [[ -z "${line//[[:space:]]/}" ]]; then
        continue
    fi

    # Split the line into its ';'-separated fields
    IFS=';' read -ra CONFIG <<< "$line"
    if [ "${#CONFIG[@]}" -lt 9 ]; then
        echo "[WARN] 配置行字段不足,已跳过: $line" >&2
        failed=$((failed + 1))
        continue
    fi

    model_name="${CONFIG[0]}"
    model_path="${CONFIG[1]}"
    tp="${CONFIG[2]}"
    # Comma-separated lists become arrays, one element per value
    read -ra batch <<< "${CONFIG[3]//,/ }"
    read -ra prompt_tokens <<< "${CONFIG[4]//,/ }"
    read -ra completion_tokens <<< "${CONFIG[5]//,/ }"
    dtype="${CONFIG[6]}"
    max_model_len="${CONFIG[7]}"
    gpu_memory_utilization="${CONFIG[8]}"

    echo "开始测试模型: $model_name"
    echo "模型路径: $model_path"
    echo "参数配置:"
    echo " tensor_parallel_size: $tp"
    echo " batch_sizes: ${batch[*]}"
    echo " prompt_tokens: ${prompt_tokens[*]}"
    echo " completion_tokens: ${completion_tokens[*]}"
    echo " dtype: $dtype"
    echo " max_model_len: $max_model_len"
    echo " gpu_memory_utilization: $gpu_memory_utilization"

    # Per-model result directory
    model_result_dir="${RESULTS_DIR}/${model_name}"
    mkdir -p "$model_result_dir"

    # Run the benchmark. stdin comes from /dev/null so the child process
    # cannot consume the remaining lines of the config file.
    python /workspace/scripts/benchmark_throughput.py \
        --model "$model_path" \
        --tensor-parallel-size "$tp" \
        --num-prompts "${batch[@]}" \
        --input-len "${prompt_tokens[@]}" \
        --output-len "${completion_tokens[@]}" \
        --dtype "$dtype" \
        --trust-remote-code \
        --max-model-len "$max_model_len" \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --output-json "${model_result_dir}/${model_name}_tp${tp}.txt" \
        2>&1 < /dev/null | tee "${model_result_dir}/${model_name}_tp${tp}.log"
    status=${PIPESTATUS[0]} # exit status of python, not of tee

    if [ "$status" -ne 0 ]; then
        echo "[ERROR] 模型测试失败: $model_name (exit $status)" >&2
        failed=$((failed + 1))
    else
        echo "完成测试模型: $model_name"
    fi
    echo "结果保存在: $model_result_dir"
    echo "----------------------------------------"
done < "$MODELS_CONFIG"

exit $(( failed > 0 ))
\ No newline at end of file
#!/bin/bash
# Environment check: records host, device and network information, then builds
# and runs rccl-tests and the dcu_env_check suite. The output of every step is
# mirrored to the console and to one log file per step under $log_dir.
#
# Individual probes are best-effort: a failing probe is logged as [WARN] and
# the script moves on. Failing to fetch rccl-tests or to fetch/run
# dcu_env_check ends the script with status 1.
set -eo pipefail # strict mode: stop on unhandled errors; a pipeline fails if any stage fails

log_dir="/workspace/test/env_check_outputs"
mkdir -p "$log_dir"
echo "==================== 开始系统环境检查 ===================="

# run_test NAME CMD [ARGS...]
#   Runs CMD with its arguments, sending stdout+stderr to the console and to
#   $log_dir/NAME.log. On failure a warning is appended to that log and the
#   function returns 1; otherwise it returns 0.
run_test() {
    local name=$1
    shift
    echo "[RUN] $name"
    "$@" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# run_pipe_test NAME 'COMMAND LINE'
#   Same as run_test, but the command is a single string evaluated by
#   `bash -c`, so it may contain pipes and other shell syntax.
run_pipe_test() {
    local name=$1
    local cmd=$2
    echo "[RUN] $name"
    bash -c "$cmd" 2>&1 | tee "$log_dir/${name}.log" || {
        echo "[WARN] $name 检查失败" | tee -a "$log_dir/${name}.log"
        return 1
    }
}

# Basic system probes. `|| true` keeps `set -e` from ending the whole run when
# one probe fails (missing tool, grep without a match, ...); the failure is
# already recorded by run_test / run_pipe_test.
run_test rocm_bandwidth_test rocm-bandwidth-test || true
run_test hy_smi hy-smi || true
run_test hy_smi_config hy-smi -c || true
run_test pip_list pip list || true
run_test cpu_info lscpu || true
run_test cpu_cores nproc || true
run_test memory_usage free -h || true
run_test disk_usage df -h || true
run_test hardware_info lshw -short || true
run_test network_interfaces ip a || true
run_test ibstat ibstat || true
run_test ibdev2netdev ibdev2netdev || true
run_pipe_test ACS_stat "lspci -vvv | grep -i acsct" || true
run_test rocm_info rocminfo || true

echo "==================== RCCL-TEST ===================="
# The test sources are cloned next to the logs.
cd "$log_dir"
if command -v git &>/dev/null && command -v make &>/dev/null; then
    if [ ! -d rccl-tests ]; then
        git clone https://www.ghproxy.cn/github.com/ROCm/rccl-tests.git --depth 1 -b master || exit 1
    fi
    cd rccl-tests || exit 1
    source /opt/dtk/env.sh
    if make MPI=1 MPI_HOME=/opt/mpi ROCM_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl \
        CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j32; then
        # Best-effort as well: a failing 8-device run must not prevent the
        # 4-device run or the remaining checks.
        run_test all_reduce_perf_8 ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 || true
        run_test all_reduce_perf_4 ./build/all_reduce_perf -b 4 -e 1G -f 2 -g 4 || true
    else
        echo "[ERROR] RCCL编译失败" | tee "$log_dir/rccl_build_fail.log"
    fi
    cd ..
else
    echo "[WARN] 缺少git或make,跳过RCCL测试" | tee "$log_dir/rccl_skip.log"
fi

echo "==================== DCU-ENV-CHECK ===================="
if [ ! -d dcu_env_check ]; then
    git clone http://developer.sourcefind.cn/codes/OpenDAS/dcu_env_check.git || {
        echo "[ERROR] DCU环境检查代码克隆失败" | tee "$log_dir/dcu_clone_fail.log"
        exit 1
    }
fi
# Explicit if/else instead of `a && { ... } || { ... }`, so that a failure of
# system_check.sh itself (not only of the cd) is reported.
if cd dcu_env_check && bash system_check.sh 2>&1 | tee "$log_dir/dcu_env_check.log"; then
    cd ..
else
    echo "[ERROR] DCU环境检查执行失败" | tee "$log_dir/dcu_check_fail.log"
    exit 1
fi

echo "==================== 检查完成 ===================="
echo "所有日志已保存至: $log_dir"
ls -lh "$log_dir"
\ No newline at end of file
# Build the vllm-test1 image from the current directory and, if the build
# succeeds, run it as root in privileged mode with:
#   - the host's hyhal directories mounted read-only,
#   - ./outputs/env_check_outputs and ./outputs/inference_outputs mounted
#     into /workspace/test/,
#   - the host directory /public/models mounted as /workspace/test/models/,
#   - /dev/kfd, /dev/mkfd and /dev/dri passed through.
# The image name is the last argument, so the image's default command runs.
docker build -t vllm-test1 . && \
docker run \
    -v /usr/local/hyhal:/usr/local/hyhal:ro \
    -v /opt/hyhal:/opt/hyhal:ro \
    -v "$PWD/outputs/env_check_outputs:/workspace/test/env_check_outputs/" \
    -v /public/models:/workspace/test/models/ \
    -v "$PWD/outputs/inference_outputs:/workspace/test/inference_outputs/" \
    --ipc=host \
    --cap-add=SYS_PTRACE \
    --group-add video \
    --ulimit memlock=-1:-1 \
    --privileged \
    --device=/dev/kfd \
    --device=/dev/mkfd \
    --device=/dev/dri \
    --shm-size=500G \
    -u root \
    --security-opt seccomp=unconfined \
    vllm-test1
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment