Commit 57dd0583 authored by chenpangpang's avatar chenpangpang
Browse files

Merge branch 'tensorflow' into 'dev'

Tensorflow

See merge request !3
parents b99980e6 d445a280
import pandas as pd
import re
import subprocess
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import argparse
import logging
class MyLogger:
    """Named logger that writes to a file and, optionally, to the console.

    Handlers are detached again in ``__del__`` so re-creating a logger with
    the same name does not accumulate duplicate handlers on the shared
    ``logging.Logger`` instance.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        """Create the logger.

        Args:
            logger_name: name passed to ``logging.getLogger``.
            log_file: path the ``FileHandler`` appends to.
            console_handler: when True, also emit records to a ``StreamHandler``.
            level: logging level for the logger and the console handler.
        """
        self.logger_name = logger_name
        self.log_file = log_file
        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)
        self.file_handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)
        # BUG FIX: always define the attribute so __del__ can test it safely.
        # The original only assigned it inside the if-branch, so constructing
        # with console_handler=False made __del__ raise AttributeError.
        self.console_handler = None
        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            # (original called setLevel twice; once is enough)
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger``."""
        return self.vlog

    def __del__(self):
        # Detach our handlers from the shared logger so later MyLogger
        # instances with the same name don't double-log.
        self.vlog.removeHandler(self.file_handler)
        if self.console_handler is not None:
            self.vlog.removeHandler(self.console_handler)
# Package a built docker image into a tar file and transfer it to the remote host.
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Save ``image_name`` as ``tar_file``, rsync it to ``args.des_path``,
    and clean up local artifacts on success.

    Args:
        image_name: docker image (``name:tag``) to save.
        tar_file: tar file name produced by ``script/save.sh``.
        image_result_dir: directory the tar is moved to before transfer.
        logger: logger for progress messages.

    Side effects: appends the outcome to ``args.ok_file``; on success deletes
    the local tar and the rsync log.  Relies on the module-level ``args``.
    NOTE(review): image_name/tar_file are interpolated into shell commands —
    ensure they come from a trusted spreadsheet (shell-injection risk otherwise).
    """
    # Save the image to a tar and move it next to the per-image results.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/"
    ]
    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        subprocess.run(save_command, shell=True)
    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")
    # Build the remote-transfer command; rsync output goes to a per-image log.
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" {image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    logger.info(f"远程传输命令: {rsync_command}")
    retries = 0
    while retries < args.trans_retry_max_num:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")
            # Record success, then remove the local tar and the rsync log.
            with open(args.ok_file, "a") as log:
                log.write(f"{image_name} 成功传输\n")
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                logger.info(f"{tar_file_path} 已删除")
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                logger.info(f"{recvlog_file} 已删除")
            break  # success: stop retrying
        except subprocess.CalledProcessError:
            retries += 1
            # BUG FIX: the original read args.trans_retry_num here and below,
            # but argparse only defines --trans-retry-max-num, so every failed
            # transfer raised AttributeError instead of retrying.
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {retries}/{args.trans_retry_max_num} 次")
            if retries < args.trans_retry_max_num:
                time.sleep(args.trans_retry_delay)  # back off before retrying
            else:
                logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
                with open(args.ok_file, "a") as log:
                    log.write(f"{image_name} 传输失败\n")
                break  # give up after the final retry
    logger.info(f"==== 镜像 {image_name} 传输完毕 ====")
def run():
    """Build, (optionally) package and transfer every image listed in the
    Excel sheet ``args.input_file``.

    For each row: create a fresh per-image log directory, derive build args,
    build the docker image, prepare the test commands, and submit the
    package-and-transfer job to a background thread pool so the next build
    can start immediately.
    """
    # Read the build matrix from the Excel file.
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)
    # Thread pool runs package_and_transfer in the background.
    with ThreadPoolExecutor() as executor:
        for index, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # used directly as the framework VERSION build arg
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']
            # Fresh per-image log directory.
            if os.path.exists(os.path.join(args.log_dir, image_name)):
                shutil.rmtree(os.path.join(args.log_dir, image_name))
            os.makedirs(os.path.join(args.log_dir, image_name))
            my_logger = MyLogger(image_name, os.path.join(args.log_dir, image_name, "run.log"))
            logger = my_logger.get_vlog()
            # Skip rows whose base-image cell is empty (NaN).
            if pd.isna(base_image):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue
            # Extract torchvision / torchaudio versions from the dependency cell.
            torchvision_version = None
            torchaudio_version = None
            if pd.notna(other_dependencies):
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)
            # Default each version independently when not found.
            # BUG FIX: the original condition was
            # "if torchvision_version is None or torchaudio_version is None",
            # which overwrote a FOUND torchvision version whenever torchaudio
            # was the missing one.
            if torchvision_version is None:
                torchvision_version = "未找到版本号"
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"
            # Select the build command for PyTorch / NVIDIA / TensorFlow bases.
            # BUG FIX: the original referenced build_command unbound (NameError)
            # when the image name matched neither framework or base_image was
            # not a string; initialize to None and guard below instead.
            build_command = None
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # pytorch/pytorch bases already ship torch: no version args.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                    else:
                        # NVIDIA bases: install torch/vision/audio via conda/pip.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        TORCH_VERSION="{framework_version}" \
                        TORCHVISION_VERSION="{torchvision_version}" \
                        TORCHAUDIO_VERSION="{torchaudio_version}" \
                        CONDA_URL="{conda_url}" \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                elif "tensorflow" in image_name:
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TENSORFLOW_VERSION="{framework_version}" \
                    CONDA_URL="{conda_url}" \
                    2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                    """
            if build_command is None:
                logger.error(f"无法识别的镜像框架,跳过该行: {image_name}")
                continue
            # Print the build command (debugging aid).
            logger.info(build_command)
            # Build, skipping the image on failure.
            try:
                logger.info(f"==== 镜像 {image_name} 开始构建 ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue  # move on to the next image
            # Directory for per-image test results (same as the log dir).
            image_result_dir = os.path.join(args.log_dir, image_name)
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")
            # NOTE(review): test_commands is built but its execution was
            # commented out upstream; kept disabled to preserve behavior.
            # for test_command in test_commands:
            #     logger.info(f"执行测试: {test_command}")
            #     subprocess.run(test_command, shell=True)
            # Tar file name: replace ":" with "-" and append ".tar".
            tar_file = f"{image_name.replace(':', '-')}.tar"
            if not args.no_save_trans:
                # Submit packaging/transfer to the background pool and keep building.
                executor.submit(package_and_transfer, image_name, tar_file, image_result_dir, logger)
if __name__ == '__main__':
    # CLI entry point: parse options into the module-level `args` used by
    # run() and package_and_transfer(), then start the build loop.
    parser = argparse.ArgumentParser(description='Autobuild images from a excel file.')
    # NOTE: --input-file is required, so the original's `default="input.xlsx"`
    # was dead code (argparse never applies a default to a required option);
    # it has been dropped.
    parser.add_argument('--input-file', type=str, required=True,
                        help='a excel file with images to build.')
    # NOTE(review): --index and --num are parsed but not used anywhere in
    # this script — confirm whether filtering by them was intended.
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file of succeed images')
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='transform retry max num')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='transform delay seconds')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transform image")
    args = parser.parse_args()
    run()
......@@ -14,6 +14,7 @@ ARG TORCHAUDIO_VERSION
# ----- tensorflow args -----
ARG TENSORFLOW_VERSION
ARG IMAGE_TAG
#ARG CONDA_URL="https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_24.7.1-0-Linux-x86_64.sh"
# ----- paddlepaddle args -----
......@@ -62,15 +63,6 @@ RUN if [ -n "$CONDA_URL" ];then \
&& cd .. \
&& rm -rf /tmp/conda-extension; fi
#RUN if [ $BASE_IMAGE_IS_TORCH -eq 0 ];then \
# mkdir -p /tmp/conda-extension \
# && cd /tmp/conda-extension \
# && wget $CONDA_URL \
# && bash $(echo $CONDA_URL | awk -F "/" '{print $NF}') -b -p /opt/conda \
# && echo "export PATH=\$PATH:/opt/conda/bin" >> /etc/profile.d/sothisai.sh \
# && cd .. \
# && rm -rf /tmp/conda-extension; fi
ENV PATH=$PATH:/opt/conda/bin
RUN pip3 install --upgrade pip ${SOURCES} || pip install --upgrade pip ${SOURCES} \
......@@ -92,11 +84,23 @@ RUN if [ -n "$TORCH_VERSION" ];then \
pip install --no-cache-dir transformers accelerate diffusers; fi
RUN if [ -n "$TENSORFLOW_VERSION" ]; then \
tf_version_minor=$(echo $TENSORFLOW_VERSION | cut -d'.' -f1-2 ) && \
pip install --no-cache-dir tensorflow[and-cuda]==$TENSORFLOW_VERSION \
tensorflow-text==$tf_version_minor.* tf-models-official==$tf_version_minor.* && \
apt-get update -y && \
apt-get install --no-install-recommends -y libnvinfer8 libnvjitlink-12-3 libnvjpeg-12-3 libnvinfer-plugin8; fi
tf_version_minor=$(echo $TENSORFLOW_VERSION | cut -d'.' -f1-2 ); \
[ "$tf_version_minor" == "2.13" ] || [ "$tf_version_minor" == "2.18" ] && tensorflow_text_version=$tf_version_minor.0rc0 || tensorflow_text_version=$tf_version_minor.*; \
pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -i https://pypi.org/simple/ && \
pip install --no-cache-dir tensorflow-text==$tensorflow_text_version tensorflow-hub; fi
# 2.16.1必须手动添加环境变量
RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc;fi && \
if [ $TENSORFLOW_VERSION == "2.8.0" ] || [ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*;fi && \
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION == "2.5.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*;fi && \
if [ $TENSORFLOW_VERSION == "2.8.0" ] || [ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ] || [ $TENSORFLOW_VERSION == "2.9.3" ] || [ $TENSORFLOW_VERSION == "2.14.0" ]; then \
pip install --no-cache-dir "numpy<2"; fi
# ----- paddlepaddle install -----
RUN if [ -n "$PADDLEPADDLE_VERSION" ] && [ -n "$PADDLE_URL" ]; then \
......@@ -114,7 +118,6 @@ RUN if [ -n "$PADDLENLP_VERSION" ] ; then \
pip install --upgrade ppdiffusers --no-deps && rm -r /root/.cache/pip; \
fi
COPY ./python-requirements.txt /tmp/
RUN pip install --no-cache-dir -r /tmp/python-requirements.txt
......@@ -150,7 +153,6 @@ RUN jupytersite="$(python3 -m pip show jupyterlab | grep -i '^location' | awk '{
&& ssh-keygen -A \
&& sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
EXPOSE 8888
......
......@@ -9,9 +9,10 @@ build_args=" --build-arg BASE_IMAGE=$base_image"
if [ ${base_image%%:*} = "pytorch/pytorch" ]; then
build_args="$build_args --build-arg BASE_IMAGE_IS_TORCH=1 "
fi
build_args="$build_args --build-arg IMAGE_TAG=$image_tag"
for arg in ${*:4}
do
build_args="$build_args --build-arg $arg "
build_args="$build_args --build-arg $arg"
done
tmp_dockerfile="Dockerfile.${RANDOM}"
......
# Mapping: CUDA major version -> matching cuDNN major version,
# used when composing torch base-image tags.
TORCH_CUDNN_CONFIG = {
"12": "9",
"11": "8",
"10": "7",
"9": "7"
}
# CUDA major.minor (e.g. "12.4") -> [exact CUDA version (e.g. "12.4.1"),
# cuDNN version (may be an empty string when the tag carries no cudnn part)]
NVIDIA_CUDA_CUDNN_VERSION_CONFIG = {
"12.4": ["12.4.1", ""],
"12.1": ["12.1.0", "8"]
}
import pandas as pd
import argparse
from conf import config
# Base-image tag templates.  Example results:
#   nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
#   pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
BASE_NVIDIA_IMAGE_TAG = "nvidia/cuda:{cuda_version}-cudnn{cudnn_version}-{tag}-{op_system}"
BASE_TORCH_IMAGE_TAG = "pytorch/pytorch:{torch_version}-cuda{cuda_version}-cudnn{cudnn_version}-{tag}"
def generate():
    """Derive and print a base-image tag from the input CSV.

    Reads ``args.input_csv`` and prints the tag built from the FIRST data
    row only.
    NOTE(review): the unconditional ``break`` stops after one row — confirm
    whether processing all rows was intended.
    """
    table = pd.read_csv(args.input_csv)
    for _, record in table.iterrows():
        operating_system = record["操作系统"]
        cuda_ver = record["Runtime版本"].replace("cuda", "")
        cudnn_ver = config.CUDNN_CONFIG[cuda_ver.split(".")[0]]
        torch_ver = record["框架版本"]
        python_version = record["Python版本"]  # read but currently unused
        chosen_tag = "devel" if args.devel_image else "runtime"
        if args.base_image_from == "nvidia":
            base_image_tag = BASE_NVIDIA_IMAGE_TAG.format(
                cuda_version=cuda_ver,
                cudnn_version=cudnn_ver,
                tag=chosen_tag,
                op_system=operating_system,
            )
        else:
            base_image_tag = BASE_TORCH_IMAGE_TAG.format(
                cuda_version=cuda_ver,
                cudnn_version=cudnn_ver,
                tag=chosen_tag,
                torch_version=torch_ver,
            )
        print(base_image_tag)
        break
if __name__ == '__main__':
    # CLI entry point: parse options into the module-level `args` that
    # generate() reads, then run the generator.
    cli = argparse.ArgumentParser(description='Generate docker build args.')
    cli.add_argument('--input-csv', type=str,
                     default="AI内容协作表_GPU基础镜像(聂释隆).csv",
                     help='input csv file path')
    cli.add_argument('--base-image-from', type=str, default="nvidia",
                     choices=["nvidia", "torch"],
                     help='choice base image from nvidia or torch')
    cli.add_argument('--devel-image', action='store_true', default=False,
                     help='build devel image')
    args = cli.parse_args()
    generate()
import requests
def get_docker_hub_tags(repository, username=None, token=None):
    """Fetch the tag list of a Docker registry repository.

    Args:
        repository: repository name including namespace, e.g. "library/ubuntu".
        username: optional extra namespace prefix for user-owned repositories.
        token: optional bearer token when the registry requires auth.

    Returns:
        A list of tag strings, or ``[]`` when the request fails.
    """
    # BUG FIX: the original built the URL as f"...{username or ''}{repository}..."
    # with no separator, producing e.g. "your_usernamelibrary/ubuntu".
    # Join with "/" only when a username is supplied.  The path follows the
    # Docker Registry HTTP API V2: GET /v2/<name>/tags/list.
    name = f"{username}/{repository}" if username else repository
    api_url = f"https://registry-1.docker.io/v2/{name}/tags/list"
    headers = {
        'Accept': 'application/vnd.docker.distribution.manifest.v2+json'
    }
    # BUG FIX: only send Authorization when a token exists; the original sent
    # an empty-string header value when token was None.
    if token:
        headers['Authorization'] = f'Bearer {token}'
    try:
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # raise HTTPError for non-2xx responses
        # V2 tags/list responds with JSON: {"name": ..., "tags": [...]}.
        return response.json().get('tags', [])
    except requests.RequestException as e:
        print(f"Error fetching tags: {e}")
        return []
# Replace the variables below with your Docker Hub username (if any),
# repository name, and (optionally) an API token.
username = "your_username"  # provide when the repository is private or needs auth
repository = "library/ubuntu"  # e.g. the official Ubuntu image
token = "your_docker_hub_token"  # provide when the API requires auth
tags = get_docker_hub_tags(repository, username, token)
print("Tags:", tags)
\ No newline at end of file
......@@ -24,7 +24,23 @@ if [[ "$1" == *"pytorch"* ]]; then
print(\"torchaudio version: \", torchaudio.__version__);
"
elif [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all $1 python -c \
tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
$1 python -c "import os; \
os.system(\"cat /etc/issue\"); \
import sys; \
print(\"python version: \", sys.version); \
import tensorflow as tf; \
print(\"tensorflow version: \", tf.__version__); \
print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \
os.system('nvcc -V | tail -n 2')
";
else docker run --rm --platform=linux/amd64 --gpus all $1 python -c \
"import os; \
os.system(\"cat /etc/issue\"); \
import sys; \
......@@ -33,7 +49,7 @@ elif [[ "$1" == *"tensorflow"* ]]; then
print(\"tensorflow version: \", tf.__version__); \
print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \
os.system('nvcc -V | tail -n 2')
"
"; fi
elif [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python base_test.py
......@@ -43,3 +59,4 @@ else
exit 1
fi
......@@ -9,9 +9,19 @@ fi
if [[ "$1" == *"pytorch"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/gpt2 $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
-v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py
else
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py; fi; fi
if [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python text.py; fi
......@@ -9,10 +9,20 @@ fi
if [[ "$1" == *"pytorch"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/stable-diffusion-v1-4 $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
-v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py
else
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py; fi; fi
if [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python image.py; fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment