Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
gpu-base-image-build
Commits
d445a280
Commit
d445a280
authored
Oct 29, 2024
by
chenpangpang
Browse files
feat: 稳定版本的tensorflow分支
parent
5e6e34ed
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
181 additions
and
140 deletions
+181
-140
auto_build.py
auto_build.py
+175
-134
build_space/Dockerfile.jupyterlab_ubuntu
build_space/Dockerfile.jupyterlab_ubuntu
+6
-6
No files found.
auto_build.py
View file @
d445a280
...
@@ -2,187 +2,228 @@ import pandas as pd
...
@@ -2,187 +2,228 @@ import pandas as pd
import
re
import
re
import
subprocess
import
subprocess
import
os
import
os
import
sys
import
shutil
import
shutil
import
time
import
time
from
concurrent.futures
import
ThreadPoolExecutor
from
concurrent.futures
import
ThreadPoolExecutor
,
wait
,
ALL_COMPLETED
import
argparse
import
logging
# 检查命令行参数,确保提供了 Excel 文件路径
if
len
(
sys
.
argv
)
<
2
:
print
(
"请提供 Excel 文件路径作为参数"
)
sys
.
exit
(
1
)
# 获取Excel文件路径
class MyLogger:
    """Named logger that writes to a log file and, optionally, to the console.

    The wrapped ``logging.Logger`` is obtained with ``logging.getLogger`` and
    is therefore shared process-wide under ``logger_name``.  The handlers
    installed here are detached again by :meth:`close`, which also runs when
    the instance is garbage-collected.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_file: Path of the file the ``FileHandler`` writes to.
        console_handler: When true, also attach a ``StreamHandler``.
        level: Level applied to the logger and to the console handler.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        self.logger_name = logger_name
        self.log_file = log_file
        # Define both handler attributes up front so close()/__del__ never
        # hit a missing attribute, even if construction fails half-way or
        # the console handler is disabled.
        self.file_handler = None
        self.console_handler = None

        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)

        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")

        self.file_handler = logging.FileHandler(log_file)
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)

        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger``."""
        return self.vlog

    def close(self):
        """Detach and close the handlers installed by this instance.

        Safe to call more than once.
        """
        vlog = getattr(self, "vlog", None)
        for attr in ("file_handler", "console_handler"):
            handler = getattr(self, attr, None)
            if handler is None:
                continue
            if vlog is not None:
                vlog.removeHandler(handler)
            # Closing releases the log file descriptor; removeHandler alone
            # leaves it open.
            handler.close()

    def __del__(self):
        self.close()
log_file
=
"ok.txt"
# 定义日志文件的名称
max_retries
=
3
# 最大重试次数
retry_delay
=
5
# 重试前等待的秒数
# 定义一个用于打包和传输的函数
# 定义一个用于打包和传输的函数
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Package a built image into a tar archive and rsync it to the remote host.

    Reads the module-level ``args`` namespace for ``des_path``,
    ``trans_retry_max_num``, ``trans_retry_delay`` and ``ok_file``.

    Steps:
      1. Run ``script/save.sh`` for the image and move ``tar_file`` into
         ``image_result_dir``.
      2. rsync the archive to ``args.des_path``, retrying up to
         ``args.trans_retry_max_num`` times with ``args.trans_retry_delay``
         seconds between attempts.
      3. On success append a success line to ``args.ok_file`` and delete the
         local archive and the rsync receive log; otherwise append a failure
         line.

    Args:
        image_name: Image reference to package.
        tar_file: File name of the archive expected from ``script/save.sh``.
        image_result_dir: Directory the archive is moved into before transfer.
        logger: ``logging.Logger`` used for progress messages.
    """

    def record(outcome):
        # One line per image so the ok-file lists every packaged/attempted image.
        with open(args.ok_file, "a", encoding="utf-8") as ok_log:
            ok_log.write(f"{image_name} {outcome}\n")

    # Package the image.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/",
    ]
    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        if subprocess.run(save_command, shell=True).returncode != 0:
            # Without the archive every rsync attempt would fail anyway, so
            # stop here instead of reporting a successful packaging.
            logger.error(f"镜像 {image_name} 打包失败: {save_command}")
            record("传输失败")
            return
    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")

    # Prepare the remote transfer; rsync's stdout goes to a per-image log.
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = (
        f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" '
        f'{image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    )
    logger.info(f"远程传输命令: {rsync_command}")

    # The parser defines --trans-retry-max-num, so the namespace attribute
    # is trans_retry_max_num (there is no trans_retry_num).
    max_attempts = args.trans_retry_max_num
    for attempt in range(1, max_attempts + 1):
        try:
            subprocess.run(rsync_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {attempt}/{max_attempts} 次")
            if attempt < max_attempts:
                time.sleep(args.trans_retry_delay)  # wait before the next attempt
            continue

        logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")
        record("成功传输")

        # The archive and the receive log are only needed until the transfer succeeds.
        tar_file_path = os.path.join(image_result_dir, tar_file)
        for leftover in (tar_file_path, recvlog_file):
            if os.path.exists(leftover):
                os.remove(leftover)
                logger.info(f"{leftover} 已删除")
        break
    else:
        # Every attempt failed (or no attempt was allowed): give up on this image.
        logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
        record("传输失败")

    logger.info(f"==== 镜像 {image_name} 传输完毕 ====")
# 创建线程池
def _extract_torch_versions(other_dependencies):
    """Return ``(torchvision_version, torchaudio_version)`` parsed from the dependency cell."""
    torchvision_version = None
    torchaudio_version = None
    if pd.notna(other_dependencies):
        dependencies = str(other_dependencies)
        match_vision = re.search(r'torchvision-([\d.]+)', dependencies)
        match_audio = re.search(r'torchaudio-([\d.]+)', dependencies)
        if match_vision:
            torchvision_version = match_vision.group(1)
        if match_audio:
            torchaudio_version = match_audio.group(1)
    # Each version falls back to the placeholder independently, so a missing
    # torchaudio entry never overwrites a torchvision version that was found.
    if torchvision_version is None:
        torchvision_version = "未找到版本号"
    if torchaudio_version is None:
        torchaudio_version = "未找到版本号"
    return torchvision_version, torchaudio_version


def _build_command(image_name, base_image, framework_version,
                   torchvision_version, torchaudio_version, conda_url):
    """Return the ``build_ubuntu.sh`` argument list for one image, or ``None`` if the framework is unknown."""
    command = ["./build_ubuntu.sh", "jupyterlab", image_name, base_image]
    if "pytorch" in image_name:
        if "pytorch/pytorch" not in base_image:
            # Not an official pytorch/pytorch base: the build script is told
            # which torch packages and conda installer to use.
            command += [
                f"TORCH_VERSION={framework_version}",
                f"TORCHVISION_VERSION={torchvision_version}",
                f"TORCHAUDIO_VERSION={torchaudio_version}",
                f"CONDA_URL={conda_url}",
            ]
        return command
    if "tensorflow" in image_name:
        return command + [
            f"TENSORFLOW_VERSION={framework_version}",
            f"CONDA_URL={conda_url}",
        ]
    return None


def _run_and_tee(command, cwd, log_path):
    """Run ``command`` in ``cwd``, copy its combined output to the console and ``log_path``, return its exit code."""
    with open(log_path, "w", encoding="utf-8", errors="replace") as log_fh:
        with subprocess.Popen(command, cwd=cwd, stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT, universal_newlines=True,
                              errors="replace") as proc:
            for line in proc.stdout:
                print(line, end="", flush=True)
                log_fh.write(line)
        return proc.returncode


def run():
    """Build every image listed in the Excel sheet and hand packaging/transfer to a thread pool.

    Reads the module-level ``args`` namespace (``input_file``, ``log_dir``,
    ``no_save_trans``).  Each row must provide the columns ``镜像名``,
    ``基础镜像``, ``框架版本``, ``其他依赖包`` and ``conda url``.  For every row a
    fresh ``<log_dir>/<image_name>/`` directory is created holding ``run.log``
    and ``build.log``.  Rows with a missing image name or base image, or whose
    image name contains neither "pytorch" nor "tensorflow", are skipped.
    """
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)

    # MyLogger detaches its handlers when it is garbage-collected; keep every
    # instance referenced until the background transfers that log through it
    # have finished.
    image_loggers = []
    pending = []

    with ThreadPoolExecutor() as executor:
        for _, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # passed through as the framework version
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']

            if not isinstance(image_name, str):
                # No usable name means no log directory can be derived for the row.
                print(f"镜像名缺失,跳过该行: {image_name}")
                continue

            # Per-image log directory, recreated from scratch on every run.
            image_log_dir = os.path.join(args.log_dir, image_name)
            if os.path.exists(image_log_dir):
                shutil.rmtree(image_log_dir)
            os.makedirs(image_log_dir)
            my_logger = MyLogger(image_name, os.path.join(image_log_dir, "run.log"))
            image_loggers.append(my_logger)
            logger = my_logger.get_vlog()

            # A NaN (or otherwise non-string) base image cannot be built.
            if not isinstance(base_image, str):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue

            torchvision_version, torchaudio_version = _extract_torch_versions(other_dependencies)
            build_command = _build_command(
                image_name, base_image, framework_version,
                torchvision_version, torchaudio_version, conda_url,
            )
            if build_command is None:
                logger.error(f"无法识别镜像类型(既非 pytorch 也非 tensorflow),跳过该镜像: {image_name}")
                continue

            # Log the build command (for debugging).
            logger.info(" ".join(build_command))

            logger.info(f"==== 镜像 {image_name} 开始构建 ====")
            try:
                # Output is tee'd in Python rather than via "| tee": a shell
                # pipeline reports tee's exit status and hides build failures.
                exit_code = _run_and_tee(
                    build_command, "build_space",
                    os.path.join(image_log_dir, "build.log"),
                )
            except OSError as err:
                logger.error(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ==== ({err})")
                continue
            if exit_code != 0:
                logger.error(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue

            # Results for the image live next to its logs.
            image_result_dir = image_log_dir

            # NOTE: the script/1_base_test.sh, script/2_text_test.sh and
            # script/3_image_test.sh runs were commented out in the original
            # and are likewise not executed here.

            # Archive name: ":" replaced by "-" plus a ".tar" suffix.
            tar_file = f"{image_name.replace(':', '-')}.tar"

            if not args.no_save_trans:
                # Package and transfer in the background so the next build can start.
                future = executor.submit(
                    package_and_transfer, image_name, tar_file, image_result_dir, logger
                )
                pending.append((image_name, future, logger))

    # The executor has drained; report failures that would otherwise be
    # swallowed inside the futures.
    for image_name, future, logger in pending:
        error = future.exception()
        if error is not None:
            logger.error(f"镜像 {image_name} 打包/传输任务异常: {error!r}")
if __name__ == '__main__':
    # Command-line entry point.  ``args`` is deliberately assigned at module
    # level: run() and package_and_transfer() read it as a global.
    parser = argparse.ArgumentParser(description='Autobuild images from an Excel file.')
    # Not marked required: combined with a default, required=True made the
    # default unreachable.
    parser.add_argument('--input-file', type=str, default="input.xlsx",
                        help='an Excel file listing the images to build')
    # NOTE: --index and --num are parsed but not read by the visible run().
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='file recording the transfer result of each image')
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='maximum number of transfer attempts')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='seconds to wait between transfer attempts')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transfer the image")
    args = parser.parse_args()
    run()
build_space/Dockerfile.jupyterlab_ubuntu
View file @
d445a280
...
@@ -94,12 +94,12 @@ RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
...
@@ -94,12 +94,12 @@ RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc; \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc;
fi &&
\
el
if [ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
if
[ $TENSORFLOW_VERSION == "2.8.0" ] ||
[ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*; \
pip install --no-cache-dir protobuf==3.20.*;
fi &&
\
el
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION ==
"2.5.0" ] || [ $TENSORFLOW_VERSION ==
"2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*; \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*;
fi &&
\
el
if [ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ]|| [ $TENSORFLOW_VERSION == "2.9.3" ]; then \
if
[ $TENSORFLOW_VERSION == "2.8.0" ] ||
[ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ]
|| [ $TENSORFLOW_VERSION == "2.9.3"
] || [ $TENSORFLOW_VERSION == "2.14.0"
]; then \
pip install --no-cache-dir "numpy<2"; fi
pip install --no-cache-dir "numpy<2"; fi
# ----- paddlepaddle install -----
# ----- paddlepaddle install -----
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment