Commit d01c8cdc authored by chenpangpang

feat: remove numpy installation

parent 67624b99
import pandas as pd
import re
import subprocess
import os
import sys
import shutil
import time
from concurrent.futures import ThreadPoolExecutor
# Check the command-line arguments to make sure an Excel file path was provided
if len(sys.argv) < 2:
    print("Please provide the Excel file path as an argument")
    sys.exit(1)

# Path of the Excel file
excel_file_path = sys.argv[1]

# Read the Excel file
df = pd.read_excel(excel_file_path)

# Make sure the result directory exists
result_dir = "result"
os.makedirs(result_dir, exist_ok=True)

log_file = "ok.txt"  # name of the transfer log file
max_retries = 3      # maximum number of retries
retry_delay = 5      # seconds to wait before retrying


# Package an image and transfer it to the remote host
def package_and_transfer(image_name, tar_file, image_result_dir):
    # Save (package) the image
    save_commands = [
        f"sh script/save.sh {image_name}",
        f"mv {tar_file} {image_result_dir}/"
    ]
    for save_command in save_commands:
        print(f"Packaging image: {save_command}")
        subprocess.run(save_command, shell=True)
    print(f"Image {image_name} was packaged as {tar_file}")

    # Prepare the remote transfer command
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" {image_result_dir}/{tar_file} openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/ > {recvlog_file}'

    # Print and run the rsync transfer command
    print(f"Remote transfer command: {rsync_command}")
    retries = 0
    while retries < max_retries:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            print(f"Image {tar_file} transferred successfully, log saved to {recvlog_file}")
            # On success, append the image name to the log file
            with open(log_file, "a") as log:
                log.write(f"{image_name} transferred successfully\n")
            # On success, delete the .tar file
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                print(f"{tar_file_path} deleted")
            # On success, delete the recvlog file
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                print(f"{recvlog_file} deleted")
            break  # leave the retry loop after a successful transfer
        except subprocess.CalledProcessError:
            retries += 1
            print(f"Transfer of image {tar_file} failed, retry {retries}/{max_retries}")
            if retries < max_retries:
                time.sleep(retry_delay)  # wait a moment before retrying
            else:
                print(f"Transfer failed more than the maximum number of retries, skipping image {image_name}")
                with open(log_file, "a") as log:
                    log.write(f"{image_name} transfer failed\n")
                break  # give up on this image after exhausting the retries
    print(f"==== Image {image_name} transfer finished ====")


# Create the thread pool
with ThreadPoolExecutor() as executor:
    # Iterate over the rows and build each image automatically
    for index, row in df.iterrows():
        image_name = row['镜像名']
        base_image = row['基础镜像']
        framework_version = row['框架版本']  # the framework version is passed straight through as the *_VERSION build arg
        other_dependencies = row['其他依赖包']
        conda_url = row['conda url']  # conda download URL

        # Handle NaN: make sure base_image is a string
        if pd.isna(base_image):
            print(f"Base image information is missing, skipping row: {image_name}")
            continue

        # Extract the torchvision and torchaudio version numbers
        torchvision_version = None
        torchaudio_version = None
        if pd.notna(other_dependencies):
            # Use regular expressions to pull out the torchvision and torchaudio versions
            match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
            match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
            if match_vision:
                torchvision_version = match_vision.group(1)
            if match_audio:
                torchaudio_version = match_audio.group(1)

        # If no torchvision or torchaudio version was found, fall back to a placeholder
        if torchvision_version is None:
            torchvision_version = "version not found"
        if torchaudio_version is None:
            torchaudio_version = "version not found"

        # Build logic for PyTorch- and NVIDIA-based images
        # (assumes every image name contains either "pytorch" or "tensorflow")
        if isinstance(base_image, str):
            if "pytorch" in image_name:
                if "pytorch/pytorch" in base_image:
                    # Build command for images based on the official PyTorch image
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image}
                    """
                else:
                    # Build command for images based on the NVIDIA CUDA image
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TORCH_VERSION="{framework_version}" \
                    TORCHVISION_VERSION="{torchvision_version}" \
                    TORCHAUDIO_VERSION="{torchaudio_version}" \
                    CONDA_URL="{conda_url}"
                    """
            elif "tensorflow" in image_name:
                build_command = f"""
                cd build_space && \
                ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                TENSORFLOW_VERSION="{framework_version}" \
                CONDA_URL="{conda_url}"
                """

        # Print the build command (for debugging)
        print(build_command)

        # Run the build command and catch failures
        try:
            print(f"==== Image {image_name} build started ====")
            subprocess.run(build_command, shell=True, check=True)
        except subprocess.CalledProcessError:
            print(f"==== Image {image_name} build failed, skipping ====")
            continue  # move on to the next image

        # Create a directory named after the image to hold the test results
        image_result_dir = os.path.join(result_dir, image_name.replace('/', '_'))
        # If the directory already exists, remove it first
        if os.path.exists(image_result_dir):
            shutil.rmtree(image_result_dir)
        # Re-create the directory
        os.makedirs(image_result_dir, exist_ok=True)

        # Run the tests and save the logs into that directory
        test_commands = [
            f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
            f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
            f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
        ]
        if "pytorch" in image_name:
            test_commands.append(f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")

        # Run the test commands
        for test_command in test_commands:
            print(f"Running test: {test_command}")
            subprocess.run(test_command, shell=True)

        # Name of the packaged image file: replace ":" with "-" and append ".tar"
        tar_file = f"{image_name.replace(':', '-')}.tar"

        # Submit the packaging/transfer task to the background thread pool and move on to the next build
        executor.submit(package_and_transfer, image_name, tar_file, image_result_dir)
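For reference, a minimal sketch of the spreadsheet this script expects and how it might be invoked. The column names are taken from the row[...] lookups above; the script name, file name, and concrete values are placeholders and not part of this commit.

import pandas as pd

# Hypothetical example input; only the column names come from the script above.
sample = pd.DataFrame([{
    "镜像名": "pytorch:2.3.1-cuda12.1-jupyterlab",
    "基础镜像": "pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime",
    "框架版本": "2.3.1",
    "其他依赖包": "torchvision-0.18.1 torchaudio-2.3.1",
    "conda url": "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh",
}])
sample.to_excel("images_example.xlsx", index=False)  # writing .xlsx requires openpyxl
# The build script would then be run as, e.g.:
#   python build_and_transfer.py images_example.xlsx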
@@ -80,8 +80,7 @@ RUN if [ -n "$TORCH_VERSION" ];then \
 RUN if [ -n "$TENSORFLOW_VERSION" ]; then \
     tf_version_minor=$(echo $TENSORFLOW_VERSION | cut -d'.' -f1-2 ); \
     [ "$tf_version_minor" == "2.13" ] || [ "$tf_version_minor" == "2.18" ] && tensorflow_text_version=$tf_version_minor.0rc0 || tensorflow_text_version=$tf_version_minor.*; \
-    pip install --no-cache-dir tensorflow[and-cuda]==$TENSORFLOW_VERSION tensorflow-text==$tensorflow_text_version tensorflow-hub && \
-    # pip install --no-cache-dir "numpy<2" ; fi
+    pip install --no-cache-dir tensorflow[and-cuda]==$TENSORFLOW_VERSION tensorflow-text==$tensorflow_text_version tensorflow-hub; fi
 # ----- paddlepaddle install -----
......
TORCH_CUDNN_CONFIG = {
    "12": "9",
    "11": "8",
    "10": "7",
    "9": "7"
}

# CUDA major version ("12.4"): [full CUDA version ("12.4.1"), cuDNN version (may be empty)]
NVIDIA_CUDA_CUDNN_VERSION_CONFIG = {
    "12.4": ["12.4.1", ""],
    "12.1": ["12.1.0", "8"]
}
import pandas as pd
import argparse
from conf import config

# Example base image tags:
#   nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
#   pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
BASE_NVIDIA_IMAGE_TAG = "nvidia/cuda:{cuda_version}-cudnn{cudnn_version}-{tag}-{op_system}"
BASE_TORCH_IMAGE_TAG = "pytorch/pytorch:{torch_version}-cuda{cuda_version}-cudnn{cudnn_version}-{tag}"


def generate():
    data = pd.read_csv(args.input_csv)
    for index, row in data.iterrows():
        op_system = row["操作系统"]
        cuda_version = row["Runtime版本"].replace("cuda", "")
        cudnn_version = config.CUDNN_CONFIG[cuda_version.split(".")[0]]
        torch_version = row["框架版本"]
        python_version = row["Python版本"]
        if args.devel_image:
            tag = "devel"
        else:
            tag = "runtime"
        if args.base_image_from == "nvidia":
            base_image_tag = BASE_NVIDIA_IMAGE_TAG.format(cuda_version=cuda_version,
                                                          cudnn_version=cudnn_version,
                                                          tag=tag,
                                                          op_system=op_system)
        else:
            base_image_tag = BASE_TORCH_IMAGE_TAG.format(cuda_version=cuda_version,
                                                         cudnn_version=cudnn_version,
                                                         tag=tag,
                                                         torch_version=torch_version)
        print(base_image_tag)
        break  # only the first row is processed for now


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate docker build args.')
    parser.add_argument('--input-csv', type=str, default="AI内容协作表_GPU基础镜像(聂释隆).csv",
                        help='input csv file path')
    parser.add_argument('--base-image-from', type=str, default="nvidia", choices=["nvidia", "torch"],
                        help='choose the base image family: nvidia or torch')
    parser.add_argument('--devel-image', action='store_true', default=False,
                        help='build devel image')
    args = parser.parse_args()
    generate()
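A possible invocation of the generator above, assuming the script is saved as generate_base_image.py (the file name is not shown in this commit) and the CSV provides the columns read in generate():

    python generate_base_image.py --input-csv images.csv --base-image-from torch --devel-image

For a row with Runtime版本 "cuda12.1" and 框架版本 "2.3.1" this prints a tag of the form pytorch/pytorch:2.3.1-cuda12.1-cudnn<N>-devel, with <N> looked up in config.CUDNN_CONFIG, and then stops after the first row because of the break at the end of the loop.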
import requests


def get_docker_hub_tags(repository, username=None, token=None):
    # Note: this URL is only illustrative; replace it with the actual API endpoint you need.
    # If the API requires authentication, the token has to be sent with the request.
    # `repository` is expected to already include its namespace, e.g. "library/ubuntu".
    api_url = f"https://registry-1.docker.io/v2/{repository}/tags/list"
    headers = {
        'Authorization': f'Bearer {token}' if token else '',
        'Accept': 'application/vnd.docker.distribution.manifest.v2+json'
    }
    try:
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # raise HTTPError for non-200 responses
        # Parse the response body; it is assumed to be JSON containing a "tags" list
        tags = response.json().get('tags', [])
        return tags
    except requests.RequestException as e:
        print(f"Error fetching tags: {e}")
        return []


# Replace the variables below with your Docker Hub username (if any),
# the repository name and (optionally) an API token
username = "your_username"        # provide this if the repository is private or needs authentication
repository = "library/ubuntu"     # e.g. the official Ubuntu image
token = "your_docker_hub_token"   # provide this if the API requires authentication

tags = get_docker_hub_tags(repository, username, token)
print("Tags:", tags)