# auto_build.py
# (Web-view residue removed here: the file-size banner, the blame header for a
# commit by chenpangpang, and the line-number gutter 1-188.)
import pandas as pd
import re
import subprocess
import os
import sys
import shutil
import time
from concurrent.futures import ThreadPoolExecutor

# Settings read by the packaging/transfer worker defined further down.
log_file = "ok.txt"  # file that transfer outcomes are appended to
max_retries = 3  # maximum number of rsync attempts per image
retry_delay = 5  # seconds to wait before retrying a failed transfer

# Directory under which per-image test results are stored.
result_dir = "result"

# The Excel workbook path must be given as the first command-line argument;
# exit with status 1 when it is missing.
try:
    excel_file_path = sys.argv[1]
except IndexError:
    print("请提供 Excel 文件路径作为参数")
    sys.exit(1)

# Load the build matrix from the workbook.
df = pd.read_excel(excel_file_path)

# Make sure the result directory exists before any build starts.
os.makedirs(result_dir, exist_ok=True)

# Package a built image into a tar archive and upload it to the remote host.
def package_and_transfer(image_name, tar_file, image_result_dir):
    """Save an image to a tar archive and rsync it to the remote host.

    Runs ``script/save.sh`` for ``image_name``, moves ``tar_file`` into
    ``image_result_dir`` and uploads it with rsync. The upload is attempted
    up to the module-level ``max_retries`` times, sleeping ``retry_delay``
    seconds between attempts. The outcome is appended to the module-level
    ``log_file``.

    If a packaging command exits non-zero, the failure is logged and the
    function returns without attempting the upload (there is no archive to
    send in that case).

    Args:
        image_name: Image reference passed to ``script/save.sh``.
        tar_file: Archive file name that is moved into ``image_result_dir``.
            NOTE(review): presumably written to the working directory by
            ``script/save.sh`` -- confirm against that script.
        image_result_dir: Directory the archive is moved into and uploaded
            from.

    Returns:
        None.
    """
    # Package the image and move the archive next to the test results.
    save_commands = [
        f"sh script/save.sh {image_name}",
        f"mv {tar_file} {image_result_dir}/"
    ]

    for save_command in save_commands:
        print(f"打包镜像: {save_command}")
        try:
            # check=True so a failed save/mv is not reported as a success.
            subprocess.run(save_command, shell=True, check=True)
        except subprocess.CalledProcessError as error:
            print(f"镜像 {image_name} 打包失败(退出码 {error.returncode}),跳过传输")
            with open(log_file, "a") as log:
                log.write(f"{image_name} 打包失败\n")
            return

    print(f"镜像 {image_name} 已成功打包 {tar_file}")

    # Build the remote transfer command; rsync output is redirected to a
    # per-image receive log.
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no"  {image_result_dir}/{tar_file} openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/ > {recvlog_file}'

    # Print the rsync command before running it.
    print(f"远程传输命令: {rsync_command}")

    for attempt in range(1, max_retries + 1):
        try:
            subprocess.run(rsync_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            print(f"镜像 {tar_file} 传输失败,尝试重试 {attempt}/{max_retries} 次")
            if attempt < max_retries:
                time.sleep(retry_delay)  # wait before the next attempt
                continue

            # Retry budget exhausted: record the failure and give up.
            print(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
            with open(log_file, "a") as log:
                log.write(f"{image_name} 传输失败\n")
            break
        else:
            print(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")

            # Record the successful transfer in the log file.
            with open(log_file, "a") as log:
                log.write(f"{image_name} 成功传输\n")

            # The local archive is no longer needed once it has been uploaded.
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                print(f"{tar_file_path} 已删除")

            # Remove the receive log as well after a successful transfer.
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                print(f"{recvlog_file} 已删除")

            break  # done, no further attempts needed

    print(f"==== 镜像 {image_name} 传输完毕  ====")


# Build images one row at a time; packaging and upload of each finished image
# run on a background thread pool so the next build can start immediately.
with ThreadPoolExecutor() as executor:
    # (image name, future) pairs, kept so background failures can be reported.
    submitted_tasks = []

    # Iterate over the spreadsheet rows and build one image per row.
    for index, row in df.iterrows():
        image_name = row['镜像名']
        base_image = row['基础镜像']
        framework_version = row['框架版本']  # used as the framework version build argument
        other_dependencies = row['其他依赖包']
        conda_url = row['conda url']  # conda URL passed through to the build script

        # Skip rows without a base image (empty cells are read as NaN).
        if pd.isna(base_image):
            print(f"基础镜像信息缺失,跳过该行: {image_name}")
            continue

        # The image name is used in substring checks and shell commands below,
        # so it has to be a string; an empty cell would otherwise raise.
        if not isinstance(image_name, str):
            print(f"镜像名缺失,跳过第 {index} 行")
            continue

        # Extract the torchvision and torchaudio versions from the
        # dependency column, when present.
        torchvision_version = None
        torchaudio_version = None
        if pd.notna(other_dependencies):
            # str() guards against cells that pandas parsed as a non-string.
            dependencies_text = str(other_dependencies)
            match_vision = re.search(r'torchvision-([\d.]+)', dependencies_text)
            match_audio = re.search(r'torchaudio-([\d.]+)', dependencies_text)
            if match_vision:
                torchvision_version = match_vision.group(1)
            if match_audio:
                torchaudio_version = match_audio.group(1)

        # Fall back to a placeholder when a version was not found.
        # NOTE(review): the placeholder text is passed to the build script
        # as-is -- confirm build_ubuntu.sh handles it.
        if torchvision_version is None:
            torchvision_version = "未找到版本号"
        if torchaudio_version is None:
            torchaudio_version = "未找到版本号"

        # Reset on every row so a row that matches no branch below can never
        # reuse the command built for the previous row.
        build_command = None

        # Choose the build command from the image name and base image.
        if isinstance(base_image, str):
            if "pytorch" in image_name:
                if "pytorch/pytorch" in base_image:
                    # Build command for a pytorch/pytorch base image
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image}
                    """
                else:
                    # Build command for other base images: framework versions
                    # and the conda URL are passed explicitly
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TORCH_VERSION="{framework_version}" \
                    TORCHVISION_VERSION="{torchvision_version}" \
                    TORCHAUDIO_VERSION="{torchaudio_version}" \
                    CONDA_URL="{conda_url}"
                    """
            elif "tensorflow" in image_name:
                build_command = f"""
                cd build_space && \
                ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                TENSORFLOW_VERSION="{framework_version}" \
                CONDA_URL="{conda_url}"
                """

        # No branch matched (unknown framework or non-string base image).
        if build_command is None:
            print(f"无法确定构建命令,跳过该行: {image_name}")
            continue

        # Print the build command (for debugging).
        print(build_command)

        # Run the build; a failed build skips the rest of this row.
        try:
            print(f"==== 镜像 {image_name} 开始构建  ====")
            subprocess.run(build_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            print(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
            continue  # move on to the next image

        # Per-image directory that holds the test results.
        image_result_dir = os.path.join(result_dir, image_name.replace('/', '_'))

        # Start from a clean directory if one is left over from an earlier run.
        if os.path.exists(image_result_dir):
            shutil.rmtree(image_result_dir)

        # Recreate the directory.
        os.makedirs(image_result_dir, exist_ok=True)

        # Run the tests, writing each log into the per-image directory.
        test_commands = [
            f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
            f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
            f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
        ]
        if "pytorch" in image_name:
            test_commands.append(f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")

        # Test commands are best-effort: their exit status is not checked.
        for test_command in test_commands:
            print(f"执行测试: {test_command}")
            subprocess.run(test_command, shell=True)

        # Archive file name: ":" replaced by "-" plus a ".tar" suffix.
        tar_file = f"{image_name.replace(':', '-')}.tar"

        # Hand packaging and transfer to the thread pool and continue with the
        # next build.
        future = executor.submit(package_and_transfer, image_name, tar_file, image_result_dir)
        submitted_tasks.append((image_name, future))

    # Report exceptions raised inside background tasks; without this they
    # would be dropped silently along with the discarded futures.
    for submitted_image, future in submitted_tasks:
        task_error = future.exception()
        if task_error is not None:
            print(f"==== 镜像 {submitted_image} 打包/传输任务异常: {task_error!r} ====")