Commit d445a280 authored by chenpangpang

feat: stable version of the tensorflow branch

parent 5e6e34ed
@@ -2,187 +2,228 @@ import pandas as pd
import re
import subprocess
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import argparse
import logging


class MyLogger:
    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        self.logger_name = logger_name
        self.log_file = log_file
        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)
        self.file_handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)
        # Initialize to None so __del__ stays safe when no console handler is attached
        self.console_handler = None
        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        return self.vlog

    def __del__(self):
        self.vlog.removeHandler(self.file_handler)
        if self.console_handler is not None:
            self.vlog.removeHandler(self.console_handler)


# Package an image and transfer it to the remote host
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    # Save (package) the image
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/"
    ]
    for save_command in save_commands:
        logger.info(f"Packaging image: {save_command}")
        subprocess.run(save_command, shell=True)
    logger.info(f"Image {image_name} packaged into {tar_file}")

    # Prepare the remote transfer command
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" {image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'

    # Log and run the rsync transfer command
    logger.info(f"Remote transfer command: {rsync_command}")
    retries = 0
    while retries < args.trans_retry_max_num:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            logger.info(f"Image {tar_file} transferred successfully, log saved to {recvlog_file}")
            # On success, append the image name to the ok file
            with open(args.ok_file, "a") as log:
                log.write(f"{image_name} transfer succeeded\n")
            # Remove the .tar file after a successful transfer
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                logger.info(f"{tar_file_path} removed")
            # Remove the recvlog file after a successful transfer
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                logger.info(f"{recvlog_file} removed")
            break  # Stop retrying after a success
        except subprocess.CalledProcessError:
            retries += 1
            logger.info(f"Image {tar_file} transfer failed, retry {retries}/{args.trans_retry_max_num}")
            if retries < args.trans_retry_max_num:
                time.sleep(args.trans_retry_delay)  # Wait a while before retrying
            else:
                logger.warning(f"Transfer failed more than the maximum number of retries, skipping image {image_name}")
                with open(args.ok_file, "a") as log:
                    log.write(f"{image_name} transfer failed\n")
                break  # Skip this image after exceeding the retry limit
    logger.info(f"==== Image {image_name} transfer finished ====")


def run():
    # Read the Excel file
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)

    # Create a thread pool
    with ThreadPoolExecutor() as executor:
        # Iterate over the rows and build each image automatically
        for index, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # Framework version, passed as the framework VERSION build argument
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']  # conda URL

            # Per-image log directory
            if os.path.exists(os.path.join(args.log_dir, image_name)):
                shutil.rmtree(os.path.join(args.log_dir, image_name))
            os.makedirs(os.path.join(args.log_dir, image_name))
            my_logger = MyLogger(image_name, os.path.join(args.log_dir, image_name, "run.log"))
            logger = my_logger.get_vlog()

            # Handle NaN: make sure base_image is a string
            if pd.isna(base_image):
                logger.error(f"Base image information is missing, skipping row: {image_name}")
                continue

            # Extract the torchvision and torchaudio versions
            torchvision_version = None
            torchaudio_version = None
            if pd.notna(other_dependencies):
                # Use regular expressions to extract the torchvision and torchaudio versions
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)

            # Fall back to a placeholder when no torchvision or torchaudio version was found
            if torchvision_version is None:
                torchvision_version = "未找到版本号"  # placeholder: version not found
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"  # placeholder: version not found

            # Build logic for PyTorch- or NVIDIA-based images
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # Build command for PyTorch base images
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                    else:
                        # Build command for NVIDIA base images
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        TORCH_VERSION="{framework_version}" \
                        TORCHVISION_VERSION="{torchvision_version}" \
                        TORCHAUDIO_VERSION="{torchaudio_version}" \
                        CONDA_URL="{conda_url}" \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                elif "tensorflow" in image_name:
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TENSORFLOW_VERSION="{framework_version}" \
                    CONDA_URL="{conda_url}" \
                    2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                    """

            # Log the build command (for debugging)
            logger.info(build_command)

            # Run the build command and catch failures
            try:
                logger.info(f"==== Image {image_name} build started ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== Image {image_name} build failed, skipping ====")
                continue  # Move on to the next image

            # Directory that holds this image's test results
            image_result_dir = os.path.join(args.log_dir, image_name)

            # Run the tests and save the logs into the corresponding directory
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")
            # # Run the test commands
            # for test_command in test_commands:
            #     logger.info(f"Running test: {test_command}")
            #     subprocess.run(test_command, shell=True)

            # Name of the packaged image file: replace ":" with "-" and add a ".tar" suffix
            tar_file = f"{image_name.replace(':', '-')}.tar"
            if not args.no_save_trans:
                # Submit the package-and-transfer task to the background thread pool
                # and continue with the next build
                executor.submit(package_and_transfer, image_name, tar_file, image_result_dir, logger)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Auto-build images from an Excel file.')
    parser.add_argument('--input-file', type=str, default="input.xlsx", required=True,
                        help='an Excel file listing the images to build')
    parser.add_argument('--index', type=str,
                        help='the indexes of the images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file that records successfully transferred images')
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='maximum number of transfer retries')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='seconds to wait before retrying a transfer')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path on scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transfer the image")
    args = parser.parse_args()
    run()
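For reference, a minimal sketch of how this script might be invoked, assuming it is saved as autobuild.py (a hypothetical name, the diff does not show the file name) and run from the repository root where build_space/ and script/ live:

    # hypothetical file name autobuild.py; adjust to the actual script name
    python autobuild.py --input-file input.xlsx --log-dir logs --ok-file ok.txt
    # build only, skipping the save and rsync step
    python autobuild.py --input-file input.xlsx --no-save-trans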
@@ -94,12 +94,12 @@ RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
        python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
        CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
        echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
        echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc; fi && \
    if [ $TENSORFLOW_VERSION == "2.8.0" ] || [ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
        pip install --no-cache-dir protobuf==3.20.*; fi && \
    if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION == "2.5.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
        pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*; fi && \
    if [ $TENSORFLOW_VERSION == "2.8.0" ] || [ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ] || [ $TENSORFLOW_VERSION == "2.9.3" ] || [ $TENSORFLOW_VERSION == "2.14.0" ]; then \
        pip install --no-cache-dir "numpy<2"; fi

# ----- paddlepaddle install -----
...