"tests/experimental/vscode:/vscode.git/clone" did not exist on "21464e055b617b9a1c241d440f87c8efc2745e75"
Commit 54ccc164 authored by chenpangpang's avatar chenpangpang
Browse files

Merge branch 'dev' into update-jupyter-plugin

parents e5afa281 8a9a5632
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
```bash ```bash
cd build_space && \ cd build_space && \
./build_ubuntu.sh jupyterlab \ ./build_ubuntu.sh jupyterlab \
juypterlab-pytorch:2.3.1-py3.10-cuda12.1-ubuntu22.04-devel \ jupyterlab-pytorch:2.3.1-py3.10-cuda12.1-ubuntu22.04-devel \
pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel
``` ```
- 参数1: ide,不需要改动 - 参数1: ide,不需要改动
...@@ -53,7 +53,6 @@ ...@@ -53,7 +53,6 @@
- TENSORFLOW_VERSION:tensorflow版本 - TENSORFLOW_VERSION:tensorflow版本
- CONDA_URL:安装conda的url - CONDA_URL:安装conda的url
### 相关链接 ### 相关链接
- pytorch镜像(**选择devel镜像**):https://hub.docker.com/r/pytorch/pytorch/tags - pytorch镜像(**选择devel镜像**):https://hub.docker.com/r/pytorch/pytorch/tags
- nvidia镜像(**选择devel镜像**):https://hub.docker.com/r/nvidia/cuda/tags - nvidia镜像(**选择devel镜像**):https://hub.docker.com/r/nvidia/cuda/tags
......
import pandas as pd
import re
import subprocess
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import argparse
import logging
class MyLogger:
    """Per-image logger that writes to a dedicated file and, optionally, the console.

    Wraps ``logging.getLogger(logger_name)`` and attaches a file handler
    (and an optional stream handler), both removed again in ``__del__``.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        self.logger_name = logger_name
        self.log_file = log_file
        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)
        self.file_handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)
        # Bug fix: always define the attribute so __del__ cannot raise
        # AttributeError when console output was disabled.
        self.console_handler = None
        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            self.console_handler.setLevel(level)  # duplicate setLevel call removed
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger`` instance."""
        return self.vlog

    def __del__(self):
        # Detach handlers so repeated MyLogger instances for the same
        # logger name do not accumulate duplicate handlers.
        self.vlog.removeHandler(self.file_handler)
        if self.console_handler is not None:
            self.vlog.removeHandler(self.console_handler)
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Package a built image into a tar archive and rsync it to the remote host.

    Args:
        image_name: docker image name (``repo:tag``).
        tar_file: tar file name produced by ``script/save.sh``.
        image_result_dir: directory holding the image's logs/artifacts.
        logger: per-image logger from ``MyLogger``.

    Side effects: runs shell commands, appends the outcome to ``args.ok_file``,
    and deletes the tar and transfer-log files after a successful transfer.
    """
    # Save the image to a tar file, then move it into the result directory.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/"
    ]
    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        subprocess.run(save_command, shell=True)
    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")
    # One rsync log file per image (":" is not filesystem-safe).
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" {image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    logger.info(f"远程传输命令: {rsync_command}")
    retries = 0
    while retries < args.trans_retry_max_num:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")
            # Record the success so completed images can be skipped later.
            with open(args.ok_file, "a") as log:
                log.write(f"{image_name} 成功传输\n")
            # Remove the tar file after a successful transfer to free disk.
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                logger.info(f"{tar_file_path} 已删除")
            # The transfer log is only useful on failure; drop it too.
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                logger.info(f"{recvlog_file} 已删除")
            break  # success: leave the retry loop
        except subprocess.CalledProcessError:
            retries += 1
            # Bug fix: the original referenced args.trans_retry_num, which the
            # argument parser never defines (--trans-retry-max-num does), so
            # every transfer failure crashed with AttributeError.
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {retries}/{args.trans_retry_max_num} 次")
            if retries < args.trans_retry_max_num:
                time.sleep(args.trans_retry_delay)  # back off before retrying
            else:
                logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
                with open(args.ok_file, "a") as log:
                    log.write(f"{image_name} 传输失败\n")
                break  # give up on this image after the last retry
    logger.info(f"==== 镜像 {image_name} 传输完毕 ====")
def run():
    """Build, smoke-test, and optionally package/transfer each image listed
    in the input Excel file.

    One row per image: builds via ``build_space/build_ubuntu.sh``, runs the
    test scripts under ``script/``, then (unless ``--no-save-trans``) submits
    packaging + rsync transfer to a background thread so the next build can
    start immediately.
    """
    # Read the Excel build matrix.
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)
    # Thread pool runs package_and_transfer in the background per image.
    with ThreadPoolExecutor() as executor:
        for index, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # becomes TORCH/TENSORFLOW_VERSION
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']
            # Recreate the per-image log directory from scratch each run.
            if os.path.exists(os.path.join(args.log_dir, image_name)):
                shutil.rmtree(os.path.join(args.log_dir, image_name))
            os.makedirs(os.path.join(args.log_dir, image_name))
            my_logger = MyLogger(image_name, os.path.join(args.log_dir, image_name, "run.log"))
            logger = my_logger.get_vlog()
            # Skip rows whose base-image cell is empty (NaN).
            if pd.isna(base_image):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue
            # Parse torchvision / torchaudio versions out of the dependency column.
            torchvision_version = None
            torchaudio_version = None
            if pd.notna(other_dependencies):
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)
            # Fall back to a placeholder for any version that was not found.
            # Bug fix: the original overwrote torchvision_version whenever
            # torchaudio_version was missing, discarding a parsed value.
            if torchvision_version is None:
                torchvision_version = "未找到版本号"
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"
            # Assemble the build command; None means "unsupported image type".
            build_command = None
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # pytorch/pytorch base images already ship torch.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                    else:
                        # nvidia base images need torch installed explicitly.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        TORCH_VERSION="{framework_version}" \
                        TORCHVISION_VERSION="{torchvision_version}" \
                        TORCHAUDIO_VERSION="{torchaudio_version}" \
                        CONDA_URL="{conda_url}" \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                elif "tensorflow" in image_name:
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TENSORFLOW_VERSION="{framework_version}" \
                    CONDA_URL="{conda_url}" \
                    2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                    """
            # Bug fix: build_command was previously unbound (NameError) for
            # image names matching neither "pytorch" nor "tensorflow".
            if build_command is None:
                logger.error(f"==== 镜像 {image_name} 类型不支持,跳过该镜像 ====")
                continue
            # Log the build command (useful for debugging), then run it.
            logger.info(build_command)
            try:
                logger.info(f"==== 镜像 {image_name} 开始构建 ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue  # move on to the next image
            # Directory collecting the test logs for this image.
            image_result_dir = os.path.join(args.log_dir, image_name)
            # Run the smoke tests, each logging to its own file.
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                # The stable-diffusion test writes an output image worth keeping.
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")
            for test_command in test_commands:
                logger.info(f"执行测试: {test_command}")
                subprocess.run(test_command, shell=True)
            # Tar file name: ":" replaced by "-" plus a ".tar" suffix.
            tar_file = f"{image_name.replace(':', '-')}.tar"
            if not args.no_save_trans:
                # Hand packaging + transfer to the pool; keep building.
                executor.submit(package_and_transfer, image_name, tar_file, image_result_dir, logger)
if __name__ == '__main__':
    # CLI entry point for the autobuild pipeline.
    parser = argparse.ArgumentParser(description='Autobuild images from a excel file.')
    # NOTE(review): this default is dead code — required=True forces the
    # caller to always pass --input-file explicitly.
    parser.add_argument('--input-file', type=str, default="input.xlsx", required=True,
                        help='a excel file with images to build.')
    # NOTE(review): --index and --num are parsed but appear unused by run()
    # as visible in this file — confirm before relying on them.
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    # Directory collecting per-image build/test logs.
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    # File recording which images transferred successfully / failed.
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file of succeed images')
    # Retry policy for the rsync transfer step.
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='transform retry max num')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='transform delay seconds')
    # rsync destination (user@host:path) on the receiving cluster.
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    # When set, skip the save/transfer stage entirely (build + test only).
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transform image")
    args = parser.parse_args()
    run()
# CUDA major version -> matching cuDNN major version for torch base images.
# NOTE(review): the generator script reads config.CUDNN_CONFIG — confirm this
# constant is exported under that name (or alias it) in the conf package.
TORCH_CUDNN_CONFIG = {
    "12": "9",
    "11": "8",
    "10": "7",
    "9": "7"
}
# CUDA major.minor (e.g. "12.4") -> [exact CUDA version (e.g. "12.4.1"),
# cuDNN version — may be empty when the image tag carries no cuDNN part].
NVIDIA_CUDA_CUDNN_VERSION_CONFIG = {
    "12.4": ["12.4.1", ""],
    "12.1": ["12.1.0", "8"]
}
import pandas as pd
import argparse
from conf import config
# Templates for base-image tags; examples of the final form:
# nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
# pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
BASE_NVIDIA_IMAGE_TAG = "nvidia/cuda:{cuda_version}-cudnn{cudnn_version}-{tag}-{op_system}"
BASE_TORCH_IMAGE_TAG = "pytorch/pytorch:{torch_version}-cuda{cuda_version}-cudnn{cudnn_version}-{tag}"
def generate():
    """Print the docker base-image tag derived from the input CSV.

    NOTE(review): the loop breaks after the first row, so exactly one tag is
    printed per invocation — confirm whether all rows should be handled.
    """
    data = pd.read_csv(args.input_csv)
    for _, row in data.iterrows():
        op_system = row["操作系统"]
        cuda_version = row["Runtime版本"].replace("cuda", "")
        # cuDNN version follows from the CUDA major version.
        cudnn_version = config.CUDNN_CONFIG[cuda_version.split(".")[0]]
        torch_version = row["框架版本"]
        python_version = row["Python版本"]  # read from the row; currently unused
        tag = "devel" if args.devel_image else "runtime"
        # Pick the template and its substitution fields by base-image source.
        if args.base_image_from == "nvidia":
            template = BASE_NVIDIA_IMAGE_TAG
            fields = dict(cuda_version=cuda_version, cudnn_version=cudnn_version,
                          tag=tag, op_system=op_system)
        else:
            template = BASE_TORCH_IMAGE_TAG
            fields = dict(cuda_version=cuda_version, cudnn_version=cudnn_version,
                          tag=tag, torch_version=torch_version)
        print(template.format(**fields))
        break
if __name__ == '__main__':
    # CLI entry point for the base-image-tag generator.
    parser = argparse.ArgumentParser(description='Generate docker build args.')
    # Input CSV describing the images (default is a shared spreadsheet export).
    parser.add_argument('--input-csv', type=str, default="AI内容协作表_GPU基础镜像(聂释隆).csv",
                        help='input csv file path')
    # Which registry family the base image comes from.
    parser.add_argument('--base-image-from', type=str, default="nvidia", choices=["nvidia", "torch"],
                        help='choice base image from nvidia or torch')
    # Select the "devel" tag instead of the default "runtime" tag.
    parser.add_argument('--devel-image', action='store_true', default=False,
                        help='build devel image')
    args = parser.parse_args()
    generate()
import requests
def get_docker_hub_tags(repository, username=None, token=None):
    """Fetch the list of tags for a Docker registry repository.

    Args:
        repository: repository path, e.g. "library/ubuntu".
        username: optional namespace prepended to the repository path.
        token: optional bearer token when the API requires authentication.

    Returns:
        A list of tag strings, or an empty list on any request error.
    """
    # NOTE: this URL/flow is illustrative — the real registry API usually
    # requires obtaining a pull-scope bearer token first.
    # Bug fix: the original concatenated username and repository with no
    # separator (e.g. "your_usernamelibrary/ubuntu"); join them with "/".
    name = f"{username}/{repository}" if username else repository
    api_url = f"https://registry-1.docker.io/v2/{name}/tags/list"
    headers = {
        'Accept': 'application/vnd.docker.distribution.manifest.v2+json'
    }
    # Bug fix: only send Authorization when a token exists — the original
    # sent an empty "Bearer " header, which some registries reject.
    if token:
        headers['Authorization'] = f'Bearer {token}'
    try:
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # raise HTTPError on non-2xx status codes
        # The JSON body is expected to contain a "tags" list.
        return response.json().get('tags', [])
    except requests.RequestException as e:
        print(f"Error fetching tags: {e}")
        return []
# Replace the variables below with your Docker Hub username (if any),
# repository name, and (optionally) an API token, then run the script.
username = "your_username"  # supply when the repository is private or auth is required
repository = "library/ubuntu"  # e.g. the official Ubuntu image
token = "your_docker_hub_token"  # supply when the API requires authentication
tags = get_docker_hub_tags(repository, username, token)
print("Tags:", tags)
\ No newline at end of file
...@@ -24,7 +24,23 @@ if [[ "$1" == *"pytorch"* ]]; then ...@@ -24,7 +24,23 @@ if [[ "$1" == *"pytorch"* ]]; then
print(\"torchaudio version: \", torchaudio.__version__); print(\"torchaudio version: \", torchaudio.__version__);
" "
elif [[ "$1" == *"tensorflow"* ]]; then elif [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all $1 python -c \ tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
$1 python -c "import os; \
os.system(\"cat /etc/issue\"); \
import sys; \
print(\"python version: \", sys.version); \
import tensorflow as tf; \
print(\"tensorflow version: \", tf.__version__); \
print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \
os.system('nvcc -V | tail -n 2')
";
else docker run --rm --platform=linux/amd64 --gpus all $1 python -c \
"import os; \ "import os; \
os.system(\"cat /etc/issue\"); \ os.system(\"cat /etc/issue\"); \
import sys; \ import sys; \
...@@ -33,12 +49,14 @@ elif [[ "$1" == *"tensorflow"* ]]; then ...@@ -33,12 +49,14 @@ elif [[ "$1" == *"tensorflow"* ]]; then
print(\"tensorflow version: \", tf.__version__); \ print(\"tensorflow version: \", tf.__version__); \
print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \ print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \
os.system('nvcc -V | tail -n 2') os.system('nvcc -V | tail -n 2')
" "; fi
elif [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python base_test.py
else else
echo "ERROR: no supported test shell" echo "ERROR: no supported test shell"
exit 1 exit 1
fi fi
...@@ -6,8 +6,22 @@ if [ -z "$1" ]; then ...@@ -6,8 +6,22 @@ if [ -z "$1" ]; then
exit 1 exit 1
fi fi
if [[ "$1" == *"pytorch"* ]]; then \ if [[ "$1" == *"pytorch"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/gpt2 $1 python infer.py; fi docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/gpt2 $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then \ if [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py; fi tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
\ No newline at end of file # 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
-v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py
else
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py; fi; fi
if [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python text.py; fi
...@@ -6,8 +6,23 @@ if [ -z "$1" ]; then ...@@ -6,8 +6,23 @@ if [ -z "$1" ]; then
exit 1 exit 1
fi fi
if [[ "$1" == *"pytorch"* ]]; then \ if [[ "$1" == *"pytorch"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/stable-diffusion-v1-4 $1 python infer.py; fi docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/stable-diffusion-v1-4 $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then \ if [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py; fi tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
-v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py
else
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py; fi; fi
if [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python image.py; fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment