Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
gpu-base-image-build
Commits
d445a280
"server/text_generation_server/models/t5.py" did not exist on "c6e8b9442b1fcf7bbbe4be58fcd85047f69e4112"
Commit
d445a280
authored
Oct 29, 2024
by
chenpangpang
Browse files
feat: 稳定版本的tensorflow分支
parent
5e6e34ed
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
181 additions
and
140 deletions
+181
-140
auto_build.py
auto_build.py
+175
-134
build_space/Dockerfile.jupyterlab_ubuntu
build_space/Dockerfile.jupyterlab_ubuntu
+6
-6
No files found.
auto_build.py
View file @
d445a280
...
...
@@ -2,187 +2,228 @@ import pandas as pd
import
re
import
subprocess
import
os
import
sys
import
shutil
import
time
from
concurrent.futures
import
ThreadPoolExecutor
from
concurrent.futures
import
ThreadPoolExecutor
,
wait
,
ALL_COMPLETED
import
argparse
import
logging
# 检查命令行参数,确保提供了 Excel 文件路径
if
len
(
sys
.
argv
)
<
2
:
print
(
"请提供 Excel 文件路径作为参数"
)
sys
.
exit
(
1
)
# 获取Excel文件路径
excel_file_path
=
sys
.
argv
[
1
]
class MyLogger:
    """Per-image logger writing to a dedicated log file and, optionally, to the console.

    Each built image gets its own logger (keyed by ``logger_name``) so that
    build/transfer messages from concurrent worker threads stay separated.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        self.logger_name = logger_name
        self.log_file = log_file
        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)
        # File handler: every record goes to the per-image log file.
        self.file_handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)
        # BUG FIX: always define self.console_handler, so __del__ cannot raise
        # AttributeError when console output is disabled (the original only set
        # the attribute inside the `if console_handler:` branch).
        self.console_handler = None
        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            # The original called setLevel twice in a row; once is enough.
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger`` instance."""
        return self.vlog

    def __del__(self):
        # Detach (and close) handlers so constructing MyLogger repeatedly for
        # the same logger name does not accumulate stale handlers that would
        # duplicate every log line.
        self.vlog.removeHandler(self.file_handler)
        self.file_handler.close()  # release the open file descriptor
        if self.console_handler is not None:
            self.vlog.removeHandler(self.console_handler)
# File that records images whose transfer succeeded.
log_file = "ok.txt"
# Maximum number of transfer retries.
max_retries = 3
# Seconds to wait before retrying a failed transfer.
retry_delay = 5
# 定义一个用于打包和传输的函数
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Package a built image into a tar file, rsync it to the remote host,
    and clean up the local artifacts on success.

    Runs inside a ThreadPoolExecutor worker; reads the module-level ``args``
    namespace for the retry count, retry delay, destination path and ok-file.
    """
    # Save the image to a tar file, then move it into the result directory.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/",
    ]
    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        subprocess.run(save_command, shell=True)
    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")

    # Per-image receive log (":" in an image tag is not filename-safe).
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = (
        f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" '
        f'{image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    )
    logger.info(f"远程传输命令: {rsync_command}")

    retries = 0
    while retries < args.trans_retry_max_num:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")
            # Record the success so this image is not re-transferred later.
            with open(args.ok_file, "a") as log:
                log.write(f"{image_name} 成功传输\n")
            # Delete the local tar file after a successful transfer.
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                logger.info(f"{tar_file_path} 已删除")
            # Delete the receive log as well.
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                logger.info(f"{recvlog_file} 已删除")
            break  # success: leave the retry loop
        except subprocess.CalledProcessError:
            retries += 1
            # BUG FIX: the original referenced args.trans_retry_num, which the
            # argument parser never defines (the flag is --trans-retry-max-num),
            # so the first failed transfer raised AttributeError instead of
            # retrying. Use the attribute argparse actually creates.
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {retries}/{args.trans_retry_max_num} 次")
            if retries < args.trans_retry_max_num:
                time.sleep(args.trans_retry_delay)  # back off before retrying
            else:
                logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
                with open(args.ok_file, "a") as log:
                    log.write(f"{image_name} 传输失败\n")
                break  # exhausted retries: give up on this image
    logger.info(f"==== 镜像 {image_name} 传输完毕 ====")
# 创建线程池
with
ThreadPoolExecutor
()
as
executor
:
# 遍历每一行数据,自动构建镜像
for
index
,
row
in
df
.
iterrows
():
image_name
=
row
[
'镜像名'
]
base_image
=
row
[
'基础镜像'
]
framework_version
=
row
[
'框架版本'
]
# 直接获取框架版本作为 framework_VERSION
other_dependencies
=
row
[
'其他依赖包'
]
conda_url
=
row
[
'conda url'
]
# 获取conda URL
# 处理 NaN 情况:确保 base_image 是字符串
if
pd
.
isna
(
base_image
):
print
(
f
"基础镜像信息缺失,跳过该行:
{
image_name
}
"
)
continue
# 提取 torchvision 和 torchaudio 版本号
torchvision_version
=
None
torchaudio_version
=
None
if
pd
.
notna
(
other_dependencies
):
# 使用正则表达式提取torchvision和torchaudio版本
match_vision
=
re
.
search
(
r
'torchvision-([\d.]+)'
,
other_dependencies
)
match_audio
=
re
.
search
(
r
'torchaudio-([\d.]+)'
,
other_dependencies
)
if
match_vision
:
torchvision_version
=
match_vision
.
group
(
1
)
if
match_audio
:
torchaudio_version
=
match_audio
.
group
(
1
)
# 如果未找到torchvision或torchaudio的版本,默认设置为空
if
torchvision_version
is
None
:
torchvision_version
=
"未找到版本号"
if
torchaudio_version
is
None
:
torchaudio_version
=
"未找到版本号"
# 基于 PyTorch 或 NVIDIA 镜像的构建逻辑
if
isinstance
(
base_image
,
str
):
if
"pytorch"
in
image_name
:
if
"pytorch/pytorch"
in
base_image
:
# 构建 PyTorch 镜像的命令
build_command
=
f
"""
cd build_space &&
\
./build_ubuntu.sh jupyterlab
{
image_name
}
{
base_image
}
"""
else
:
# 构建 NVIDIA 镜像的命令
logger
.
info
(
f
"==== 镜像
{
image_name
}
传输完毕 ===="
)
def run():
    """Build, test, and (optionally) package/transfer every image listed in
    the input Excel file. Reads settings from the module-level ``args``."""
    # Load the image list from the Excel file.
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)
    # Thread pool: packaging/transfer runs in the background while the main
    # loop moves on to build the next image.
    with ThreadPoolExecutor() as executor:
        # Iterate over every row and build each image.
        for index, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # framework version used as *_VERSION
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']  # conda download URL

            # Per-image log directory: recreate it from scratch each run.
            image_log_dir = os.path.join(args.log_dir, image_name)
            if os.path.exists(image_log_dir):
                shutil.rmtree(image_log_dir)
            os.makedirs(image_log_dir)
            my_logger = MyLogger(image_name, os.path.join(image_log_dir, "run.log"))
            logger = my_logger.get_vlog()

            # Skip rows with a missing (NaN) base image.
            if pd.isna(base_image):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue

            # Extract torchvision / torchaudio versions from the dependency column.
            torchvision_version = None
            torchaudio_version = None
            if pd.notna(other_dependencies):
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)
            # BUG FIX: the original used `if torchvision_version is None or
            # torchaudio_version is None:` here, which overwrote a *found*
            # torchvision version whenever only torchaudio was missing.
            if torchvision_version is None:
                torchvision_version = "未找到版本号"
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"

            # Choose the build command based on the image family.
            build_command = None
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # Official PyTorch base image: no extra version args needed.
                        build_command = f"""
cd build_space && \
./build_ubuntu.sh jupyterlab {image_name} {base_image} \
2>&1 | tee ../{args.log_dir}/{image_name}/build.log
"""
                    else:
                        # NVIDIA base image: pass the torch/vision/audio versions.
                        build_command = f"""
cd build_space && \
./build_ubuntu.sh jupyterlab {image_name} {base_image} \
TORCH_VERSION="{framework_version}" \
TORCHVISION_VERSION="{torchvision_version}" \
TORCHAUDIO_VERSION="{torchaudio_version}" \
CONDA_URL="{conda_url}" \
2>&1 | tee ../{args.log_dir}/{image_name}/build.log
"""
                elif "tensorflow" in image_name:
                    # BUG FIX: the original also passed TORCH_VERSION /
                    # TORCHVISION_VERSION / TORCHAUDIO_VERSION and a duplicate
                    # CONDA_URL into the TensorFlow build (copy-paste residue);
                    # only TENSORFLOW_VERSION and CONDA_URL are relevant here.
                    build_command = f"""
cd build_space && \
./build_ubuntu.sh jupyterlab {image_name} {base_image} \
TENSORFLOW_VERSION="{framework_version}" \
CONDA_URL="{conda_url}" \
2>&1 | tee ../{args.log_dir}/{image_name}/build.log
"""
            # ROBUSTNESS: the original left build_command undefined (NameError)
            # for rows matching neither "pytorch" nor "tensorflow".
            if build_command is None:
                logger.error(f"无法确定构建命令,跳过该行: {image_name}")
                continue

            # Log the build command (useful for debugging).
            logger.info(build_command)
            # Run the build; on failure skip to the next image.
            try:
                logger.info(f"==== 镜像 {image_name} 开始构建 ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue

            # Test results are kept alongside the per-image logs.
            image_result_dir = os.path.join(args.log_dir, image_name)
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}"
                )
            # # 执行测试命令 (test execution disabled in this version)
            # for test_command in test_commands:
            #     logger.info(f"执行测试: {test_command}")
            #     subprocess.run(test_command, shell=True)

            # Tar file name: ":" replaced by "-" plus a ".tar" suffix.
            tar_file = f"{image_name.replace(':', '-')}.tar"
            if not args.no_save_trans:
                # Hand packaging/transfer to the pool and continue building.
                executor.submit(package_and_transfer, image_name, tar_file,
                                image_result_dir, logger)
if __name__ == '__main__':
    # Command-line interface: all runtime settings live in the global `args`
    # namespace, which run() and package_and_transfer() read.
    parser = argparse.ArgumentParser(description='Autobuild images from a excel file.')
    # NOTE: --input-file is required, so the original's default="input.xlsx"
    # was dead code (argparse ignores `default` when required=True); dropped.
    parser.add_argument('--input-file', type=str, required=True,
                        help='a excel file with images to build.')
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file of succeed images')
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='transform retry max num')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='transform delay seconds')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transform image")
    args = parser.parse_args()
    run()
build_space/Dockerfile.jupyterlab_ubuntu
View file @
d445a280
...
...
@@ -94,12 +94,12 @@ RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc; \
el
if [ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*; \
el
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*; \
el
if [ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ]|| [ $TENSORFLOW_VERSION == "2.9.3" ]; then \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc;
fi &&
\
if
[ $TENSORFLOW_VERSION == "2.8.0" ] ||
[ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*;
fi &&
\
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION ==
"2.5.0" ] || [ $TENSORFLOW_VERSION ==
"2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*;
fi &&
\
if
[ $TENSORFLOW_VERSION == "2.8.0" ] ||
[ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ]
|| [ $TENSORFLOW_VERSION == "2.9.3"
] || [ $TENSORFLOW_VERSION == "2.14.0"
]; then \
pip install --no-cache-dir "numpy<2"; fi
# ----- paddlepaddle install -----
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment