# auto_build.py

import pandas as pd
import re
import subprocess
import os
import sys
import shutil
import time
from concurrent.futures import ThreadPoolExecutor

# The Excel file path must be supplied as the first command-line argument;
# without it the script prints a usage hint and exits with status 1.
try:
    excel_file_path = sys.argv[1]
except IndexError:
    print("请提供 Excel 文件路径作为参数")
    sys.exit(1)

# Load the spreadsheet into a DataFrame (one row per image to build).
df = pd.read_excel(excel_file_path)

# Root folder for per-image results; created up front if it does not exist.
result_dir = "result"
os.makedirs(result_dir, exist_ok=True)

# Settings used by the package/transfer step.
log_file = "ok.txt"  # file that transfer outcomes are appended to
max_retries = 3  # maximum number of transfer attempts
retry_delay = 5  # seconds to wait before retrying

# 定义一个用于打包和传输的函数
def package_and_transfer(image_name, tar_file, image_result_dir):
    """Package a built image as a tar archive and rsync it to the remote host.

    Steps:
      1. Run ``script/save.sh`` for the image and move ``tar_file`` into
         ``image_result_dir``. If either command exits non-zero, the failure
         is appended to ``log_file`` and the transfer is skipped.
      2. rsync the archive to the remote host, trying up to ``max_retries``
         times and sleeping ``retry_delay`` seconds between attempts.
      3. On success, append a line to ``log_file`` and delete both the local
         tar archive and the rsync output log. After the last failed attempt,
         append a failure line to ``log_file`` instead.

    Args:
        image_name: Image name (with tag) handed to ``script/save.sh``.
        tar_file: Name of the tar archive the save step is expected to
            produce (assumed to be relative to the working directory —
            TODO confirm against ``script/save.sh``).
        image_result_dir: Directory the archive is moved into before transfer.

    Returns:
        None. Outcomes are reported on stdout and in ``log_file``.
    """
    # NOTE(review): the arguments are interpolated into shell command strings
    # unquoted; this assumes names coming from the spreadsheet contain no
    # whitespace or shell metacharacters.
    save_commands = [
        f"sh script/save.sh {image_name}",
        f"mv {tar_file} {image_result_dir}/"
    ]

    for save_command in save_commands:
        print(f"打包镜像: {save_command}")
        try:
            # check=True: a failed save/move must not be reported as success
            # nor followed by transfer attempts for an archive that is absent.
            subprocess.run(save_command, shell=True, check=True)
        except subprocess.CalledProcessError as err:
            print(f"镜像 {image_name} 打包失败 (退出码 {err.returncode})，跳过传输")
            with open(log_file, "a") as log:
                log.write(f"{image_name} 打包失败\n")
            return

    print(f"镜像 {image_name} 已成功打包 {tar_file}")

    # Build the remote transfer command; rsync's output is redirected to a
    # per-image log file.
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no"  {image_result_dir}/{tar_file} openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/ > {recvlog_file}'

    # Print, then run, the rsync transfer command.
    print(f"远程传输命令: {rsync_command}")

    for attempt in range(1, max_retries + 1):
        try:
            subprocess.run(rsync_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            print(f"镜像 {tar_file} 传输失败，尝试重试 {attempt}/{max_retries} 次")
            if attempt < max_retries:
                time.sleep(retry_delay)  # back off before the next attempt
            continue

        print(f"镜像 {tar_file} 传输成功，日志保存到 {recvlog_file}")

        # Record the successful transfer in the log file.
        with open(log_file, "a") as log:
            log.write(f"{image_name} 成功传输\n")

        # The local archive is no longer needed once it has been transferred.
        tar_file_path = os.path.join(image_result_dir, tar_file)
        if os.path.exists(tar_file_path):
            os.remove(tar_file_path)
            print(f"{tar_file_path} 已删除")

        # Likewise drop the rsync output log after a successful transfer.
        if os.path.exists(recvlog_file):
            os.remove(recvlog_file)
            print(f"{recvlog_file} 已删除")

        break  # success: stop retrying
    else:
        # Every attempt failed: give up on this image and record it.
        print(f"传输失败超过最大重试次数，跳过镜像 {image_name}")
        with open(log_file, "a") as log:
            log.write(f"{image_name} 传输失败\n")

    print(f"==== 镜像 {image_name} 传输完毕  ====")


# Thread pool for background package/transfer jobs, so the next build can
# start while the previous image is still being saved and uploaded.
with ThreadPoolExecutor() as executor:
    # (image name, future) pairs, kept so that exceptions raised inside the
    # background jobs can be reported instead of being silently discarded.
    transfer_jobs = []

    # Walk the spreadsheet row by row and build each image.
    for index, row in df.iterrows():
        image_name = row['镜像名']
        base_image = row['基础镜像']
        framework_version = row['框架版本']  # passed through as the framework version
        other_dependencies = row['其他依赖包']
        conda_url = row['conda url']
        # NOTE(review): an empty 'conda url' cell is forwarded to the build
        # script as the literal text "nan" — confirm the script tolerates it.

        # Rows without a base image cannot be built.
        if pd.isna(base_image):
            print(f"基础镜像信息缺失，跳过该行: {image_name}")
            continue

        # The image name drives every later step and the substring checks
        # below require text; skip rows where the cell is empty or not text.
        if not isinstance(image_name, str):
            print(f"镜像名缺失或无效，跳过第 {index} 行")
            continue

        # Extract torchvision / torchaudio versions from the dependency list.
        torchvision_version = None
        torchaudio_version = None
        if pd.notna(other_dependencies):
            # str() guards against cells that pandas parsed as numbers.
            dependencies_text = str(other_dependencies)
            match_vision = re.search(r'torchvision-([\d.]+)', dependencies_text)
            match_audio = re.search(r'torchaudio-([\d.]+)', dependencies_text)
            if match_vision:
                torchvision_version = match_vision.group(1)
            if match_audio:
                torchaudio_version = match_audio.group(1)

        # Fall back to a placeholder when a version could not be found.
        # NOTE(review): this placeholder text is handed to the build script
        # verbatim — confirm the script expects it rather than an empty value.
        if torchvision_version is None:
            torchvision_version = "未找到版本号"
        if torchaudio_version is None:
            torchaudio_version = "未找到版本号"

        # Choose the build command. It is reset for every row so that a row
        # matching no branch can never reuse the previous row's command.
        build_command = None
        if isinstance(base_image, str):
            if "pytorch" in image_name:
                if "pytorch/pytorch" in base_image:
                    # Official PyTorch base image: no extra version arguments.
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image}
                    """
                else:
                    # Other (e.g. NVIDIA) base image: pass the torch versions.
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TORCH_VERSION="{framework_version}" \
                    TORCHVISION_VERSION="{torchvision_version}" \
                    TORCHAUDIO_VERSION="{torchaudio_version}" \
                    CONDA_URL="{conda_url}"
                    """
            elif "tensorflow" in image_name:
                build_command = f"""
                cd build_space && \
                ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                TENSORFLOW_VERSION="{framework_version}" \
                CONDA_URL="{conda_url}"
                """

        if build_command is None:
            print(f"无法确定镜像 {image_name} 的构建方式（镜像名需包含 pytorch 或 tensorflow，且基础镜像须为文本），跳过该行")
            continue

        # Print the build command (for debugging).
        print(build_command)

        # Run the build; a failed build skips this image only.
        try:
            print(f"==== 镜像 {image_name} 开始构建  ====")
            subprocess.run(build_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            print(f"==== 镜像 {image_name} 构建失败，跳过该镜像 ====")
            continue  # move on to the next image

        # Per-image folder that holds the test results.
        image_result_dir = os.path.join(result_dir, image_name.replace('/', '_'))

        # Start from a clean folder: remove any previous results first.
        if os.path.exists(image_result_dir):
            shutil.rmtree(image_result_dir)

        # Recreate the folder.
        os.makedirs(image_result_dir, exist_ok=True)

        # Test scripts, each logging stdout and stderr into the result folder.
        test_commands = [
            f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
            f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
            f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
        ]
        if "pytorch" in image_name:
            test_commands.append(f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")

        # Run the tests. Exit codes are not checked here: a failing test does
        # not stop the pipeline, its output is in the corresponding log file.
        for test_command in test_commands:
            print(f"执行测试: {test_command}")
            subprocess.run(test_command, shell=True)

        # Archive file name: ":" replaced by "-" plus a ".tar" suffix.
        tar_file = f"{image_name.replace(':', '-')}.tar"

        # Hand packaging and transfer to the pool and continue with the next build.
        transfer_jobs.append(
            (image_name, executor.submit(package_and_transfer, image_name, tar_file, image_result_dir))
        )

    # Surface exceptions raised inside background jobs; without this they
    # would stay inside the discarded Future objects and never be seen.
    for job_image_name, job in transfer_jobs:
        try:
            job.result()
        except Exception as err:
            print(f"镜像 {job_image_name} 打包/传输任务异常: {err}")