auto_build.py 12.4 KB
Newer Older
chenpangpang's avatar
chenpangpang committed
1
2
3
4
5
6
import pandas as pd
import re
import subprocess
import os
import shutil
import time
7
8
9
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import argparse
import logging
chenpangpang's avatar
chenpangpang committed
10
11
import json
from packaging.version import Version
chenpangpang's avatar
chenpangpang committed
12

13
14
15
16
17
18
class MyLogger:
    """Named logger that writes to a log file and, optionally, to the console.

    Handlers are attached to ``logging.getLogger(logger_name)`` on construction
    and detached again when the instance is finalized, so the instance must be
    kept alive for as long as the underlying logger is expected to emit output.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_file: Path of the file the file handler writes to.
        console_handler: When true, also attach a ``StreamHandler``.
        level: Level applied to the logger and to the console handler.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        self.logger_name = logger_name
        self.log_file = log_file
        # Define both handler attributes up front so __del__ never hits a
        # missing attribute: neither when the console handler is disabled nor
        # when FileHandler() raises half-way through construction.
        self.file_handler = None
        self.console_handler = None

        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)

        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")

        self.file_handler = logging.FileHandler(log_file)
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)

        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger``."""
        return self.vlog

    def __del__(self):
        """Detach this instance's handlers from the logger and close them."""
        vlog = getattr(self, "vlog", None)
        for attr in ("file_handler", "console_handler"):
            handler = getattr(self, attr, None)
            if handler is None:
                continue
            if vlog is not None:
                vlog.removeHandler(handler)
            # Closing releases the log file descriptor; calling close() more
            # than once is harmless, so an explicit __del__() call is safe.
            handler.close()
chenpangpang's avatar
chenpangpang committed
39
40
41


# 定义一个用于打包和传输的函数
42
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Save a built image to a tar archive and rsync it to the remote host.

    Reads the module-level ``args`` namespace (``des_path``, ``ok_file``,
    ``trans_retry_max_num``, ``trans_retry_delay``). The outcome is appended to
    ``args.ok_file``; on a successful transfer the local archive and the rsync
    receive log are deleted.

    Args:
        image_name: Name of the image; passed to ``script/save.sh``.
        tar_file: File name of the archive that the save step is expected to
            produce in the current working directory.
        image_result_dir: Directory the archive is moved into before transfer.
        logger: ``logging.Logger`` used for progress messages.
    """
    # Package the image, then move the archive next to the image's logs.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/"
    ]

    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        completed = subprocess.run(save_command, shell=True)
        if completed.returncode != 0:
            # Without the archive in place the transfer cannot succeed, so
            # record the failure instead of reporting a successful packaging.
            logger.error(f"打包命令执行失败 (返回码 {completed.returncode}),跳过镜像 {image_name}")
            with open(args.ok_file, "a", encoding="utf-8") as log:
                log.write(f"{image_name} 打包失败\n")
            return

    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")

    # Build the remote transfer command; rsync output goes to a per-image log.
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no"  {image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    logger.info(f"远程传输命令: {rsync_command}")

    # argparse stores --trans-retry-max-num as args.trans_retry_max_num; that is
    # the only retry-count attribute that exists on the namespace.
    max_attempts = args.trans_retry_max_num
    for attempt in range(1, max_attempts + 1):
        try:
            subprocess.run(rsync_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {attempt}/{max_attempts} 次")
            if attempt < max_attempts:
                time.sleep(args.trans_retry_delay)  # wait before the next attempt
                continue
            logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
            with open(args.ok_file, "a", encoding="utf-8") as log:
                log.write(f"{image_name} 传输失败\n")
            break

        logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")

        # Record the successfully transferred image.
        with open(args.ok_file, "a", encoding="utf-8") as log:
            log.write(f"{image_name} 成功传输\n")

        # The archive is no longer needed locally once it has been transferred.
        tar_file_path = os.path.join(image_result_dir, tar_file)
        if os.path.exists(tar_file_path):
            os.remove(tar_file_path)
            logger.info(f"{tar_file_path} 已删除")

        # Same for the rsync receive log.
        if os.path.exists(recvlog_file):
            os.remove(recvlog_file)
            logger.info(f"{recvlog_file} 已删除")

        break  # success: stop retrying

    logger.info(f"==== 镜像 {image_name} 传输完毕  ====")

chenpangpang's avatar
chenpangpang committed
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# 从json中获取paddle安装信息
def get_paddle_info(paddlepaddle_version, cuda_version):
    """Look up Paddle install details in ``attach/paddle.json``.

    Args:
        paddlepaddle_version: Value compared against each entry's
            ``paddle_excel_version``.
        cuda_version: Value compared against each entry's ``cuda_version``.

    Returns:
        A dict with ``paddle_version``, ``paddlenlp_version`` and
        ``paddle_url`` taken from the first matching entry, or ``None`` when
        no entry matches.
    """
    with open("attach/paddle.json", "r", encoding="utf-8") as handle:
        catalog = json.load(handle)

    hit = next(
        (
            entry
            for entry in catalog
            if entry["paddle_excel_version"] == paddlepaddle_version
            and entry["cuda_version"] == cuda_version
        ),
        None,
    )
    if hit is None:
        return None

    wanted_fields = ("paddle_version", "paddlenlp_version", "paddle_url")
    return {field: hit[field] for field in wanted_fields}

110
111
112
113
114

def run():
    """Build, test and optionally ship every image listed in the input Excel file.

    Reads the module-level ``args`` namespace (``input_file``, ``log_dir``,
    ``no_save_trans``). For each row the function:

    1. recreates ``<log_dir>/<image_name>/`` and opens a per-image logger,
    2. derives the build command from the image name and base image,
    3. runs the build, then the three test scripts,
    4. unless ``args.no_save_trans`` is set, submits packaging and transfer
       to a thread pool so the next build can start immediately.

    Rows with a missing base image, an unrecognised framework, or missing
    Paddle metadata are logged and skipped.
    """
    # Read the build matrix from the Excel file.
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)

    # MyLogger detaches its handlers when it is finalized. Holding a reference
    # per image keeps the handlers attached while a background transfer that
    # was handed the same logger is still running.
    live_loggers = {}
    # (image name, logger, future) for every submitted background transfer.
    transfers = []

    with ThreadPoolExecutor() as executor:
        # Build the images one row at a time.
        for _, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']
            runtime_version = row['Runtime版本']
            # An empty cell is read as NaN (a float), which has no .strip().
            cuda_version = (runtime_version.strip().lower()
                            if isinstance(runtime_version, str) else runtime_version)

            # Start from a clean per-image result/log directory.
            image_result_dir = os.path.join(args.log_dir, image_name)
            if os.path.exists(image_result_dir):
                shutil.rmtree(image_result_dir)
            os.makedirs(image_result_dir)
            my_logger = MyLogger(image_name, os.path.join(image_result_dir, "run.log"))
            live_loggers[image_name] = my_logger
            logger = my_logger.get_vlog()

            # A missing base image cannot be built.
            if pd.isna(base_image):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue

            # Extract the torchvision / torchaudio versions from the
            # free-text dependency column.
            torchvision_version = None
            torchaudio_version = None
            if isinstance(other_dependencies, str):
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)

            # Each version falls back to the placeholder independently, so a
            # torchvision version that was found is not overwritten just
            # because torchaudio is missing.
            if torchvision_version is None:
                torchvision_version = "未找到版本号"
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"

            # Reset per row so values never leak from a previous image.
            paddle_version = None
            build_command = None

            # Pick the build command from the framework named in the image.
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # Official PyTorch base image: no extra build variables.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                    else:
                        # Other base image: torch packages are passed as variables.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        TORCH_VERSION="{framework_version}" \
                        TORCHVISION_VERSION="{torchvision_version}" \
                        TORCHAUDIO_VERSION="{torchaudio_version}" \
                        CONDA_URL="{conda_url}" \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                elif "tensorflow" in image_name:
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TENSORFLOW_VERSION="{framework_version}" \
                    CONDA_URL="{conda_url}" \
                    2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                    """
                elif "paddle" in image_name:
                    paddle_info = get_paddle_info(str(framework_version), str(cuda_version))
                    if not paddle_info:
                        # Without this metadata the build would receive "None"
                        # versions and the later Version() comparison would raise.
                        logger.error(f"未找到指定的 PaddlePaddle 和 CUDA 版本信息,跳过该镜像: {image_name}")
                        continue
                    paddle_version = paddle_info["paddle_version"]
                    paddlenlp_version = paddle_info["paddlenlp_version"]
                    paddle_url = paddle_info["paddle_url"]
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    PADDLEPADDLE_VERSION="{paddle_version}" \
                    PADDLENLP_VERSION="{paddlenlp_version}" \
                    CONDA_URL="{conda_url}" \
                    PADDLE_URL="{paddle_url}" \
                    2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                    """

            if build_command is None:
                # No framework matched (or base_image is not a string): there
                # is nothing to run for this row.
                logger.error(f"无法确定构建方式,跳过该镜像: {image_name}")
                continue

            # Log the build command for debugging.
            logger.info(build_command)

            # NOTE(review): the command pipes into `tee`, and a POSIX shell
            # pipeline reports the exit status of its last command, so a
            # failing build script may not raise CalledProcessError here.
            # Confirm whether build failures are actually detected.
            try:
                logger.info(f"==== 镜像 {image_name} 开始构建  ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue

            # Run the tests; each script's output goes to the result directory.
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")
            elif "paddle" in image_name:
                # The image artifact is only collected for Paddle >= 2.4.
                if Version(paddle_version) >= Version("2.4"):
                    test_commands.append(
                        f"mv gpu-base-image-test/paddle/output.png {image_result_dir}")

            for test_command in test_commands:
                logger.info(f"执行测试: {test_command}")
                subprocess.run(test_command, shell=True)

            # Archive file name: ":" replaced by "-" plus a ".tar" suffix.
            tar_file = f"{image_name.replace(':', '-')}.tar"

            if not args.no_save_trans:
                # Package and transfer in the background so the next build
                # can start right away.
                future = executor.submit(package_and_transfer, image_name, tar_file,
                                         image_result_dir, logger)
                transfers.append((image_name, logger, future))

    # Leaving the `with` block waits for all transfers. Surface any exception
    # raised inside a worker, which would otherwise be silently discarded.
    for shipped_image, image_logger, future in transfers:
        error = future.exception()
        if error is not None:
            image_logger.error(f"镜像 {shipped_image} 打包或传输时发生异常: {error!r}")


if __name__ == '__main__':
    # Command-line entry point: parse the options into the module-level
    # `args` namespace that the functions above read, then run the pipeline.
    parser = argparse.ArgumentParser(description='Autobuild images from a excel file.')
    # Not marked required: combined with required=True the declared default
    # of "input.xlsx" could never take effect.
    parser.add_argument('--input-file', type=str, default="input.xlsx",
                        help='a excel file with images to build.')
    # NOTE(review): --index and --num are parsed but not read anywhere in the
    # code visible in this file; confirm whether they are still needed.
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file of succeed images')
    # Retry policy for the rsync transfer step.
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='transform retry max num')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='transform delay seconds')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transform image")
    args = parser.parse_args()
    run()