Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
gpu-base-image-build
Commits
d445a280
"server/text_generation_server/models/t5.py" did not exist on "c6e8b9442b1fcf7bbbe4be58fcd85047f69e4112"
Commit
d445a280
authored
Oct 29, 2024
by
chenpangpang
Browse files
feat: 稳定版本的tensorflow分支
parent
5e6e34ed
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
181 additions
and
140 deletions
+181
-140
auto_build.py
auto_build.py
+175
-134
build_space/Dockerfile.jupyterlab_ubuntu
build_space/Dockerfile.jupyterlab_ubuntu
+6
-6
No files found.
auto_build.py
View file @
d445a280
...
...
@@ -2,187 +2,228 @@ import pandas as pd
import
re
import
subprocess
import
os
import
sys
import
shutil
import
time
from
concurrent.futures
import
ThreadPoolExecutor
from
concurrent.futures
import
ThreadPoolExecutor
,
wait
,
ALL_COMPLETED
import
argparse
import
logging
# 检查命令行参数,确保提供了 Excel 文件路径
if
len
(
sys
.
argv
)
<
2
:
print
(
"请提供 Excel 文件路径作为参数"
)
sys
.
exit
(
1
)
# 获取Excel文件路径
excel_file_path
=
sys
.
argv
[
1
]
class MyLogger:
    """Per-image logger writing to a dedicated log file and, optionally, to the console.

    Each built image gets its own logger (keyed by ``logger_name``) so that
    build/transfer messages from concurrent worker threads stay separated.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        self.logger_name = logger_name
        self.log_file = log_file
        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)
        # File handler: every record goes to the per-image log file.
        self.file_handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)
        # BUG FIX: always define self.console_handler, so __del__ cannot raise
        # AttributeError when console output is disabled (the original only set
        # the attribute inside the `if console_handler:` branch).
        self.console_handler = None
        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            # The original called setLevel twice in a row; once is enough.
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger`` instance."""
        return self.vlog

    def __del__(self):
        # Detach (and close) handlers so constructing MyLogger repeatedly for
        # the same logger name does not accumulate stale handlers that would
        # duplicate every log line.
        self.vlog.removeHandler(self.file_handler)
        self.file_handler.close()  # release the open file descriptor
        if self.console_handler is not None:
            self.vlog.removeHandler(self.console_handler)
# File that records images whose transfer succeeded.
log_file = "ok.txt"
# Maximum number of transfer retries.
max_retries = 3
# Seconds to wait before retrying a failed transfer.
retry_delay = 5
# 定义一个用于打包和传输的函数
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Package a built image into a tar file, rsync it to the remote host,
    and clean up the local artifacts on success.

    Runs inside a ThreadPoolExecutor worker; reads the module-level ``args``
    namespace for the retry count, retry delay, destination path and ok-file.
    """
    # Save the image to a tar file, then move it into the result directory.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/",
    ]
    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        subprocess.run(save_command, shell=True)
    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")

    # Per-image receive log (":" in an image tag is not filename-safe).
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = (
        f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" '
        f'{image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    )
    logger.info(f"远程传输命令: {rsync_command}")

    retries = 0
    while retries < args.trans_retry_max_num:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")
            # Record the success so this image is not re-transferred later.
            with open(args.ok_file, "a") as log:
                log.write(f"{image_name} 成功传输\n")
            # Delete the local tar file after a successful transfer.
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                logger.info(f"{tar_file_path} 已删除")
            # Delete the receive log as well.
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                logger.info(f"{recvlog_file} 已删除")
            break  # success: leave the retry loop
        except subprocess.CalledProcessError:
            retries += 1
            # BUG FIX: the original referenced args.trans_retry_num, which the
            # argument parser never defines (the flag is --trans-retry-max-num),
            # so the first failed transfer raised AttributeError instead of
            # retrying. Use the attribute argparse actually creates.
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {retries}/{args.trans_retry_max_num} 次")
            if retries < args.trans_retry_max_num:
                time.sleep(args.trans_retry_delay)  # back off before retrying
            else:
                logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
                with open(args.ok_file, "a") as log:
                    log.write(f"{image_name} 传输失败\n")
                break  # exhausted retries: give up on this image
    logger.info(f"==== 镜像 {image_name} 传输完毕 ====")
# 创建线程池
with
ThreadPoolExecutor
()
as
executor
:
# 遍历每一行数据,自动构建镜像
for
index
,
row
in
df
.
iterrows
():
image_name
=
row
[
'镜像名'
]
base_image
=
row
[
'基础镜像'
]
framework_version
=
row
[
'框架版本'
]
# 直接获取框架版本作为 framework_VERSION
other_dependencies
=
row
[
'其他依赖包'
]
conda_url
=
row
[
'conda url'
]
# 获取conda URL
# 处理 NaN 情况:确保 base_image 是字符串
if
pd
.
isna
(
base_image
):
print
(
f
"基础镜像信息缺失,跳过该行:
{
image_name
}
"
)
continue
# 提取 torchvision 和 torchaudio 版本号
torchvision_version
=
None
torchaudio_version
=
None
if
pd
.
notna
(
other_dependencies
):
# 使用正则表达式提取torchvision和torchaudio版本
match_vision
=
re
.
search
(
r
'torchvision-([\d.]+)'
,
other_dependencies
)
match_audio
=
re
.
search
(
r
'torchaudio-([\d.]+)'
,
other_dependencies
)
if
match_vision
:
torchvision_version
=
match_vision
.
group
(
1
)
if
match_audio
:
torchaudio_version
=
match_audio
.
group
(
1
)
# 如果未找到torchvision或torchaudio的版本,默认设置为空
if
torchvision_version
is
None
:
torchvision_version
=
"未找到版本号"
if
torchaudio_version
is
None
:
torchaudio_version
=
"未找到版本号"
# 基于 PyTorch 或 NVIDIA 镜像的构建逻辑
if
isinstance
(
base_image
,
str
):
if
"pytorch"
in
image_name
:
if
"pytorch/pytorch"
in
base_image
:
# 构建 PyTorch 镜像的命令
build_command
=
f
"""
cd build_space &&
\
./build_ubuntu.sh jupyterlab
{
image_name
}
{
base_image
}
"""
else
:
# 构建 NVIDIA 镜像的命令
logger
.
info
(
f
"==== 镜像
{
image_name
}
传输完毕 ===="
)
def run():
    """Build, test, and (optionally) package/transfer every image listed in
    the input Excel file. Reads settings from the module-level ``args``."""
    # Load the image list from the Excel file.
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)
    # Thread pool: packaging/transfer runs in the background while the main
    # loop moves on to build the next image.
    with ThreadPoolExecutor() as executor:
        # Iterate over every row and build each image.
        for index, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # framework version used as *_VERSION
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']  # conda download URL

            # Per-image log directory: recreate it from scratch each run.
            image_log_dir = os.path.join(args.log_dir, image_name)
            if os.path.exists(image_log_dir):
                shutil.rmtree(image_log_dir)
            os.makedirs(image_log_dir)
            my_logger = MyLogger(image_name, os.path.join(image_log_dir, "run.log"))
            logger = my_logger.get_vlog()

            # Skip rows with a missing (NaN) base image.
            if pd.isna(base_image):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue

            # Extract torchvision / torchaudio versions from the dependency column.
            torchvision_version = None
            torchaudio_version = None
            if pd.notna(other_dependencies):
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)
            # BUG FIX: the original used `if torchvision_version is None or
            # torchaudio_version is None:` here, which overwrote a *found*
            # torchvision version whenever only torchaudio was missing.
            if torchvision_version is None:
                torchvision_version = "未找到版本号"
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"

            # Choose the build command based on the image family.
            build_command = None
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # Official PyTorch base image: no extra version args needed.
                        build_command = f"""
cd build_space && \
./build_ubuntu.sh jupyterlab {image_name} {base_image} \
2>&1 | tee ../{args.log_dir}/{image_name}/build.log
"""
                    else:
                        # NVIDIA base image: pass the torch/vision/audio versions.
                        build_command = f"""
cd build_space && \
./build_ubuntu.sh jupyterlab {image_name} {base_image} \
TORCH_VERSION="{framework_version}" \
TORCHVISION_VERSION="{torchvision_version}" \
TORCHAUDIO_VERSION="{torchaudio_version}" \
CONDA_URL="{conda_url}" \
2>&1 | tee ../{args.log_dir}/{image_name}/build.log
"""
                elif "tensorflow" in image_name:
                    # BUG FIX: the original also passed TORCH_VERSION /
                    # TORCHVISION_VERSION / TORCHAUDIO_VERSION and a duplicate
                    # CONDA_URL into the TensorFlow build (copy-paste residue);
                    # only TENSORFLOW_VERSION and CONDA_URL are relevant here.
                    build_command = f"""
cd build_space && \
./build_ubuntu.sh jupyterlab {image_name} {base_image} \
TENSORFLOW_VERSION="{framework_version}" \
CONDA_URL="{conda_url}" \
2>&1 | tee ../{args.log_dir}/{image_name}/build.log
"""
            # ROBUSTNESS: the original left build_command undefined (NameError)
            # for rows matching neither "pytorch" nor "tensorflow".
            if build_command is None:
                logger.error(f"无法确定构建命令,跳过该行: {image_name}")
                continue

            # Log the build command (useful for debugging).
            logger.info(build_command)
            # Run the build; on failure skip to the next image.
            try:
                logger.info(f"==== 镜像 {image_name} 开始构建 ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue

            # Test results are kept alongside the per-image logs.
            image_result_dir = os.path.join(args.log_dir, image_name)
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}"
                )
            # # 执行测试命令 (test execution disabled in this version)
            # for test_command in test_commands:
            #     logger.info(f"执行测试: {test_command}")
            #     subprocess.run(test_command, shell=True)

            # Tar file name: ":" replaced by "-" plus a ".tar" suffix.
            tar_file = f"{image_name.replace(':', '-')}.tar"
            if not args.no_save_trans:
                # Hand packaging/transfer to the pool and continue building.
                executor.submit(package_and_transfer, image_name, tar_file,
                                image_result_dir, logger)
if __name__ == '__main__':
    # Command-line interface: all runtime settings live in the global `args`
    # namespace, which run() and package_and_transfer() read.
    parser = argparse.ArgumentParser(description='Autobuild images from a excel file.')
    # NOTE: --input-file is required, so the original's default="input.xlsx"
    # was dead code (argparse ignores `default` when required=True); dropped.
    parser.add_argument('--input-file', type=str, required=True,
                        help='a excel file with images to build.')
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file of succeed images')
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='transform retry max num')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='transform delay seconds')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transform image")
    args = parser.parse_args()
    run()
build_space/Dockerfile.jupyterlab_ubuntu
View file @
d445a280
...
...
@@ -94,12 +94,12 @@ RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc; \
el
if [ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*; \
el
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*; \
el
if [ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ]|| [ $TENSORFLOW_VERSION == "2.9.3" ]; then \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc;
fi &&
\
if
[ $TENSORFLOW_VERSION == "2.8.0" ] ||
[ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*;
fi &&
\
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION ==
"2.5.0" ] || [ $TENSORFLOW_VERSION ==
"2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*;
fi &&
\
if
[ $TENSORFLOW_VERSION == "2.8.0" ] ||
[ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ]
|| [ $TENSORFLOW_VERSION == "2.9.3"
] || [ $TENSORFLOW_VERSION == "2.14.0"
]; then \
pip install --no-cache-dir "numpy<2"; fi
# ----- paddlepaddle install -----
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment