Commit 57dd0583 authored by chenpangpang's avatar chenpangpang
Browse files

Merge branch 'tensorflow' into 'dev'

Tensorflow

See merge request !3
parents b99980e6 d445a280
import pandas as pd
import re
import subprocess
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import argparse
import logging
class MyLogger:
    """Named logger that writes to a file and, optionally, to the console.

    Handlers are detached again in ``__del__`` so re-creating a logger with
    the same name does not accumulate duplicate handlers on the shared
    ``logging.Logger`` instance.
    """

    def __init__(self, logger_name, log_file, console_handler=True, level=logging.INFO):
        """Create the logger.

        Args:
            logger_name: name passed to ``logging.getLogger``.
            log_file: path the ``FileHandler`` appends to.
            console_handler: when True, also emit records to a ``StreamHandler``.
            level: logging level for the logger and the console handler.
        """
        self.logger_name = logger_name
        self.log_file = log_file
        self.vlog = logging.getLogger(logger_name)
        self.vlog.setLevel(level)
        self.file_handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s : %(message)s', "%Y-%m-%d %H:%M:%S")
        self.file_handler.setFormatter(formatter)
        self.vlog.addHandler(self.file_handler)
        # BUG FIX: always define the attribute so __del__ can test it safely.
        # The original only assigned it inside the if-branch, so constructing
        # with console_handler=False made __del__ raise AttributeError.
        self.console_handler = None
        if console_handler:
            self.console_handler = logging.StreamHandler()
            self.console_handler.setFormatter(formatter)
            # (original called setLevel twice; once is enough)
            self.console_handler.setLevel(level)
            self.vlog.addHandler(self.console_handler)

    def get_vlog(self):
        """Return the underlying ``logging.Logger``."""
        return self.vlog

    def __del__(self):
        # Detach our handlers from the shared logger so later MyLogger
        # instances with the same name don't double-log.
        self.vlog.removeHandler(self.file_handler)
        if self.console_handler is not None:
            self.vlog.removeHandler(self.console_handler)
# Package a built docker image into a tar file and transfer it to the remote host.
def package_and_transfer(image_name, tar_file, image_result_dir, logger):
    """Save ``image_name`` as ``tar_file``, rsync it to ``args.des_path``,
    and clean up local artifacts on success.

    Args:
        image_name: docker image (``name:tag``) to save.
        tar_file: tar file name produced by ``script/save.sh``.
        image_result_dir: directory the tar is moved to before transfer.
        logger: logger for progress messages.

    Side effects: appends the outcome to ``args.ok_file``; on success deletes
    the local tar and the rsync log.  Relies on the module-level ``args``.
    NOTE(review): image_name/tar_file are interpolated into shell commands —
    ensure they come from a trusted spreadsheet (shell-injection risk otherwise).
    """
    # Save the image to a tar and move it next to the per-image results.
    save_commands = [
        f"sh script/save.sh {image_name} > /dev/null 2>&1",
        f"mv {tar_file} {image_result_dir}/"
    ]
    for save_command in save_commands:
        logger.info(f"打包镜像: {save_command}")
        subprocess.run(save_command, shell=True)
    logger.info(f"镜像 {image_name} 已成功打包 {tar_file}")
    # Build the remote-transfer command; rsync output goes to a per-image log.
    recvlog_file = f"{image_name.replace(':', '-')}_recvlog"
    rsync_command = f'rsync -aP -e "ssh -p 65023 -i my_rsa -o StrictHostKeyChecking=no" {image_result_dir}/{tar_file} {args.des_path} > {recvlog_file}'
    logger.info(f"远程传输命令: {rsync_command}")
    retries = 0
    while retries < args.trans_retry_max_num:
        try:
            subprocess.run(rsync_command, shell=True, check=True)
            logger.info(f"镜像 {tar_file} 传输成功,日志保存到 {recvlog_file}")
            # Record success, then remove the local tar and the rsync log.
            with open(args.ok_file, "a") as log:
                log.write(f"{image_name} 成功传输\n")
            tar_file_path = os.path.join(image_result_dir, tar_file)
            if os.path.exists(tar_file_path):
                os.remove(tar_file_path)
                logger.info(f"{tar_file_path} 已删除")
            if os.path.exists(recvlog_file):
                os.remove(recvlog_file)
                logger.info(f"{recvlog_file} 已删除")
            break  # success: stop retrying
        except subprocess.CalledProcessError:
            retries += 1
            # BUG FIX: the original read args.trans_retry_num here and below,
            # but argparse only defines --trans-retry-max-num, so every failed
            # transfer raised AttributeError instead of retrying.
            logger.info(f"镜像 {tar_file} 传输失败,尝试重试 {retries}/{args.trans_retry_max_num} 次")
            if retries < args.trans_retry_max_num:
                time.sleep(args.trans_retry_delay)  # back off before retrying
            else:
                logger.warning(f"传输失败超过最大重试次数,跳过镜像 {image_name}")
                with open(args.ok_file, "a") as log:
                    log.write(f"{image_name} 传输失败\n")
                break  # give up after the final retry
    logger.info(f"==== 镜像 {image_name} 传输完毕 ====")
def run():
    """Build, (optionally) package and transfer every image listed in the
    Excel sheet ``args.input_file``.

    For each row: create a fresh per-image log directory, derive build args,
    build the docker image, prepare the test commands, and submit the
    package-and-transfer job to a background thread pool so the next build
    can start immediately.
    """
    # Read the build matrix from the Excel file.
    df = pd.read_excel(args.input_file)
    os.makedirs(args.log_dir, exist_ok=True)
    # Thread pool runs package_and_transfer in the background.
    with ThreadPoolExecutor() as executor:
        for index, row in df.iterrows():
            image_name = row['镜像名']
            base_image = row['基础镜像']
            framework_version = row['框架版本']  # used directly as the framework VERSION build arg
            other_dependencies = row['其他依赖包']
            conda_url = row['conda url']
            # Fresh per-image log directory.
            if os.path.exists(os.path.join(args.log_dir, image_name)):
                shutil.rmtree(os.path.join(args.log_dir, image_name))
            os.makedirs(os.path.join(args.log_dir, image_name))
            my_logger = MyLogger(image_name, os.path.join(args.log_dir, image_name, "run.log"))
            logger = my_logger.get_vlog()
            # Skip rows whose base-image cell is empty (NaN).
            if pd.isna(base_image):
                logger.error(f"基础镜像信息缺失,跳过该行: {image_name}")
                continue
            # Extract torchvision / torchaudio versions from the dependency cell.
            torchvision_version = None
            torchaudio_version = None
            if pd.notna(other_dependencies):
                match_vision = re.search(r'torchvision-([\d.]+)', other_dependencies)
                match_audio = re.search(r'torchaudio-([\d.]+)', other_dependencies)
                if match_vision:
                    torchvision_version = match_vision.group(1)
                if match_audio:
                    torchaudio_version = match_audio.group(1)
            # Default each version independently when not found.
            # BUG FIX: the original condition was
            # "if torchvision_version is None or torchaudio_version is None",
            # which overwrote a FOUND torchvision version whenever torchaudio
            # was the missing one.
            if torchvision_version is None:
                torchvision_version = "未找到版本号"
            if torchaudio_version is None:
                torchaudio_version = "未找到版本号"
            # Select the build command for PyTorch / NVIDIA / TensorFlow bases.
            # BUG FIX: the original referenced build_command unbound (NameError)
            # when the image name matched neither framework or base_image was
            # not a string; initialize to None and guard below instead.
            build_command = None
            if isinstance(base_image, str):
                if "pytorch" in image_name:
                    if "pytorch/pytorch" in base_image:
                        # pytorch/pytorch bases already ship torch: no version args.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                    else:
                        # NVIDIA bases: install torch/vision/audio via conda/pip.
                        build_command = f"""
                        cd build_space && \
                        ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                        TORCH_VERSION="{framework_version}" \
                        TORCHVISION_VERSION="{torchvision_version}" \
                        TORCHAUDIO_VERSION="{torchaudio_version}" \
                        CONDA_URL="{conda_url}" \
                        2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                        """
                elif "tensorflow" in image_name:
                    build_command = f"""
                    cd build_space && \
                    ./build_ubuntu.sh jupyterlab {image_name} {base_image} \
                    TENSORFLOW_VERSION="{framework_version}" \
                    CONDA_URL="{conda_url}" \
                    2>&1 | tee ../{args.log_dir}/{image_name}/build.log
                    """
            if build_command is None:
                logger.error(f"无法识别的镜像框架,跳过该行: {image_name}")
                continue
            # Print the build command (debugging aid).
            logger.info(build_command)
            # Build, skipping the image on failure.
            try:
                logger.info(f"==== 镜像 {image_name} 开始构建 ====")
                subprocess.run(build_command, shell=True, check=True)
            except subprocess.CalledProcessError:
                logger.info(f"==== 镜像 {image_name} 构建失败,跳过该镜像 ====")
                continue  # move on to the next image
            # Directory for per-image test results (same as the log dir).
            image_result_dir = os.path.join(args.log_dir, image_name)
            test_commands = [
                f"sh script/1_base_test.sh {image_name} > {image_result_dir}/1_base_test.log 2>&1",
                f"sh script/2_text_test.sh {image_name} > {image_result_dir}/2_text_test.log 2>&1",
                f"sh script/3_image_test.sh {image_name} > {image_result_dir}/3_image_test.log 2>&1",
            ]
            if "pytorch" in image_name:
                test_commands.append(
                    f"mv gpu-base-image-test/pytorch/stable-diffusion-v1-4/output.png {image_result_dir}")
            # NOTE(review): test_commands is built but its execution was
            # commented out upstream; kept disabled to preserve behavior.
            # for test_command in test_commands:
            #     logger.info(f"执行测试: {test_command}")
            #     subprocess.run(test_command, shell=True)
            # Tar file name: replace ":" with "-" and append ".tar".
            tar_file = f"{image_name.replace(':', '-')}.tar"
            if not args.no_save_trans:
                # Submit packaging/transfer to the background pool and keep building.
                executor.submit(package_and_transfer, image_name, tar_file, image_result_dir, logger)
if __name__ == '__main__':
    # CLI entry point: parse options into the module-level `args` used by
    # run() and package_and_transfer(), then start the build loop.
    parser = argparse.ArgumentParser(description='Autobuild images from a excel file.')
    # NOTE: --input-file is required, so the original's `default="input.xlsx"`
    # was dead code (argparse never applies a default to a required option);
    # it has been dropped.
    parser.add_argument('--input-file', type=str, required=True,
                        help='a excel file with images to build.')
    # NOTE(review): --index and --num are parsed but not used anywhere in
    # this script — confirm whether filtering by them was intended.
    parser.add_argument('--index', type=str,
                        help='the indexes for images to build, separated by ","')
    parser.add_argument('--num', type=int,
                        help='the number of images to build')
    parser.add_argument('--log-dir', type=str, default="logs",
                        help='logs directory')
    parser.add_argument('--ok-file', type=str, default="ok.txt",
                        help='the file of succeed images')
    parser.add_argument('--trans-retry-max-num', type=int, default=3,
                        help='transform retry max num')
    parser.add_argument('--trans-retry-delay', type=int, default=5,
                        help='transform delay seconds')
    parser.add_argument('--des-path', type=str,
                        default="openaimodels@cancon.hpccube.com:/public/home/openaimodels/chenyh/",
                        help='destination path in scnet')
    parser.add_argument("--no-save-trans", action="store_true",
                        help="do not save and transform image")
    args = parser.parse_args()
    run()
......@@ -14,6 +14,7 @@ ARG TORCHAUDIO_VERSION
# ----- tensorflow args -----
ARG TENSORFLOW_VERSION
ARG IMAGE_TAG
#ARG CONDA_URL="https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_24.7.1-0-Linux-x86_64.sh"
# ----- paddlepaddle args -----
......@@ -62,15 +63,6 @@ RUN if [ -n "$CONDA_URL" ];then \
&& cd .. \
&& rm -rf /tmp/conda-extension; fi
#RUN if [ $BASE_IMAGE_IS_TORCH -eq 0 ];then \
# mkdir -p /tmp/conda-extension \
# && cd /tmp/conda-extension \
# && wget $CONDA_URL \
# && bash $(echo $CONDA_URL | awk -F "/" '{print $NF}') -b -p /opt/conda \
# && echo "export PATH=\$PATH:/opt/conda/bin" >> /etc/profile.d/sothisai.sh \
# && cd .. \
# && rm -rf /tmp/conda-extension; fi
ENV PATH=$PATH:/opt/conda/bin
RUN pip3 install --upgrade pip ${SOURCES} || pip install --upgrade pip ${SOURCES} \
......@@ -92,11 +84,23 @@ RUN if [ -n "$TORCH_VERSION" ];then \
pip install --no-cache-dir transformers accelerate diffusers; fi
RUN if [ -n "$TENSORFLOW_VERSION" ]; then \
tf_version_minor=$(echo $TENSORFLOW_VERSION | cut -d'.' -f1-2 ) && \
pip install --no-cache-dir tensorflow[and-cuda]==$TENSORFLOW_VERSION \
tensorflow-text==$tf_version_minor.* tf-models-official==$tf_version_minor.* && \
apt-get update -y && \
apt-get install --no-install-recommends -y libnvinfer8 libnvjitlink-12-3 libnvjpeg-12-3 libnvinfer-plugin8; fi
tf_version_minor=$(echo $TENSORFLOW_VERSION | cut -d'.' -f1-2 ); \
[ "$tf_version_minor" == "2.13" ] || [ "$tf_version_minor" == "2.18" ] && tensorflow_text_version=$tf_version_minor.0rc0 || tensorflow_text_version=$tf_version_minor.*; \
pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -i https://pypi.org/simple/ && \
pip install --no-cache-dir tensorflow-text==$tensorflow_text_version tensorflow-hub; fi
# 2.16.1必须手动添加环境变量
RUN if [ $TENSORFLOW_VERSION == "2.16.1" ]; then \
python_version=$(echo $IMAGE_TAG | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}') && \
CUDNN_PATH=/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn && \
echo "export CUDNN_PATH=$CUDNN_PATH" >> /etc/bash.bashrc && \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDNN_PATH/lib:/usr/local/cuda/lib64" >> /etc/bash.bashrc;fi && \
if [ $TENSORFLOW_VERSION == "2.8.0" ] || [ $TENSORFLOW_VERSION == "2.7.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir protobuf==3.20.*;fi && \
if [ $TENSORFLOW_VERSION == "2.4.0" ] || [ $TENSORFLOW_VERSION == "2.5.0" ] || [ $TENSORFLOW_VERSION == "2.6.0" ]; then \
pip install --no-cache-dir numpy==1.19.2 matplotlib==3.6.*;fi && \
if [ $TENSORFLOW_VERSION == "2.8.0" ] || [ $TENSORFLOW_VERSION == "2.10.0" ] || [ $TENSORFLOW_VERSION == "2.11.0" ] || [ $TENSORFLOW_VERSION == "2.9.0" ] || [ $TENSORFLOW_VERSION == "2.9.3" ] || [ $TENSORFLOW_VERSION == "2.14.0" ]; then \
pip install --no-cache-dir "numpy<2"; fi
# ----- paddlepaddle install -----
RUN if [ -n "$PADDLEPADDLE_VERSION" ] && [ -n "$PADDLE_URL" ]; then \
......@@ -114,7 +118,6 @@ RUN if [ -n "$PADDLENLP_VERSION" ] ; then \
pip install --upgrade ppdiffusers --no-deps && rm -r /root/.cache/pip; \
fi
COPY ./python-requirements.txt /tmp/
RUN pip install --no-cache-dir -r /tmp/python-requirements.txt
......@@ -150,7 +153,6 @@ RUN jupytersite="$(python3 -m pip show jupyterlab | grep -i '^location' | awk '{
&& ssh-keygen -A \
&& sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
EXPOSE 8888
......
......@@ -9,9 +9,10 @@ build_args=" --build-arg BASE_IMAGE=$base_image"
if [ ${base_image%%:*} = "pytorch/pytorch" ]; then
build_args="$build_args --build-arg BASE_IMAGE_IS_TORCH=1 "
fi
build_args="$build_args --build-arg IMAGE_TAG=$image_tag"
for arg in ${*:4}
do
build_args="$build_args --build-arg $arg "
build_args="$build_args --build-arg $arg"
done
tmp_dockerfile="Dockerfile.${RANDOM}"
......
# Mapping: CUDA major version -> matching cuDNN major version,
# used when composing torch base-image tags.
TORCH_CUDNN_CONFIG = {
"12": "9",
"11": "8",
"10": "7",
"9": "7"
}
# CUDA major.minor (e.g. "12.4") -> [exact CUDA version (e.g. "12.4.1"),
# cuDNN version (may be an empty string when the tag carries no cudnn part)]
NVIDIA_CUDA_CUDNN_VERSION_CONFIG = {
"12.4": ["12.4.1", ""],
"12.1": ["12.1.0", "8"]
}
import pandas as pd
import argparse
from conf import config
# Base-image tag templates.  Example results:
#   nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
#   pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
BASE_NVIDIA_IMAGE_TAG = "nvidia/cuda:{cuda_version}-cudnn{cudnn_version}-{tag}-{op_system}"
BASE_TORCH_IMAGE_TAG = "pytorch/pytorch:{torch_version}-cuda{cuda_version}-cudnn{cudnn_version}-{tag}"
def generate():
    """Derive and print a base-image tag from the input CSV.

    Reads ``args.input_csv`` and prints the tag built from the FIRST data
    row only.
    NOTE(review): the unconditional ``break`` stops after one row — confirm
    whether processing all rows was intended.
    """
    table = pd.read_csv(args.input_csv)
    for _, record in table.iterrows():
        operating_system = record["操作系统"]
        cuda_ver = record["Runtime版本"].replace("cuda", "")
        cudnn_ver = config.CUDNN_CONFIG[cuda_ver.split(".")[0]]
        torch_ver = record["框架版本"]
        python_version = record["Python版本"]  # read but currently unused
        chosen_tag = "devel" if args.devel_image else "runtime"
        if args.base_image_from == "nvidia":
            base_image_tag = BASE_NVIDIA_IMAGE_TAG.format(
                cuda_version=cuda_ver,
                cudnn_version=cudnn_ver,
                tag=chosen_tag,
                op_system=operating_system,
            )
        else:
            base_image_tag = BASE_TORCH_IMAGE_TAG.format(
                cuda_version=cuda_ver,
                cudnn_version=cudnn_ver,
                tag=chosen_tag,
                torch_version=torch_ver,
            )
        print(base_image_tag)
        break
if __name__ == '__main__':
    # CLI entry point: parse options into the module-level `args` that
    # generate() reads, then run the generator.
    cli = argparse.ArgumentParser(description='Generate docker build args.')
    cli.add_argument('--input-csv', type=str,
                     default="AI内容协作表_GPU基础镜像(聂释隆).csv",
                     help='input csv file path')
    cli.add_argument('--base-image-from', type=str, default="nvidia",
                     choices=["nvidia", "torch"],
                     help='choice base image from nvidia or torch')
    cli.add_argument('--devel-image', action='store_true', default=False,
                     help='build devel image')
    args = cli.parse_args()
    generate()
import requests
def get_docker_hub_tags(repository, username=None, token=None):
    """Fetch the tag list of a Docker registry repository.

    Args:
        repository: repository name including namespace, e.g. "library/ubuntu".
        username: optional extra namespace prefix for user-owned repositories.
        token: optional bearer token when the registry requires auth.

    Returns:
        A list of tag strings, or ``[]`` when the request fails.
    """
    # BUG FIX: the original built the URL as f"...{username or ''}{repository}..."
    # with no separator, producing e.g. "your_usernamelibrary/ubuntu".
    # Join with "/" only when a username is supplied.  The path follows the
    # Docker Registry HTTP API V2: GET /v2/<name>/tags/list.
    name = f"{username}/{repository}" if username else repository
    api_url = f"https://registry-1.docker.io/v2/{name}/tags/list"
    headers = {
        'Accept': 'application/vnd.docker.distribution.manifest.v2+json'
    }
    # BUG FIX: only send Authorization when a token exists; the original sent
    # an empty-string header value when token was None.
    if token:
        headers['Authorization'] = f'Bearer {token}'
    try:
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # raise HTTPError for non-2xx responses
        # V2 tags/list responds with JSON: {"name": ..., "tags": [...]}.
        return response.json().get('tags', [])
    except requests.RequestException as e:
        print(f"Error fetching tags: {e}")
        return []
# Replace the variables below with your Docker Hub username (if any),
# repository name, and (optionally) an API token.
username = "your_username"  # provide when the repository is private or needs auth
repository = "library/ubuntu"  # e.g. the official Ubuntu image
token = "your_docker_hub_token"  # provide when the API requires auth
tags = get_docker_hub_tags(repository, username, token)
print("Tags:", tags)
\ No newline at end of file
......@@ -24,7 +24,23 @@ if [[ "$1" == *"pytorch"* ]]; then
print(\"torchaudio version: \", torchaudio.__version__);
"
elif [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all $1 python -c \
tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
$1 python -c "import os; \
os.system(\"cat /etc/issue\"); \
import sys; \
print(\"python version: \", sys.version); \
import tensorflow as tf; \
print(\"tensorflow version: \", tf.__version__); \
print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \
os.system('nvcc -V | tail -n 2')
";
else docker run --rm --platform=linux/amd64 --gpus all $1 python -c \
"import os; \
os.system(\"cat /etc/issue\"); \
import sys; \
......@@ -33,7 +49,7 @@ elif [[ "$1" == *"tensorflow"* ]]; then
print(\"tensorflow version: \", tf.__version__); \
print(\"tensorflow cuda available: \", tf.test.is_gpu_available()); \
os.system('nvcc -V | tail -n 2')
"
"; fi
elif [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python base_test.py
......@@ -43,3 +59,4 @@ else
exit 1
fi
......@@ -9,9 +9,19 @@ fi
if [[ "$1" == *"pytorch"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/gpt2 $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
-v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py
else
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/bert $1 python infer.py; fi; fi
if [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python text.py; fi
......@@ -9,10 +9,20 @@ fi
if [[ "$1" == *"pytorch"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/pytorch/stable-diffusion-v1-4 $1 python infer.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py; fi
if [[ "$1" == *"tensorflow"* ]]; then
tensorflow_version=$(echo "$1" | cut -d: -f2 | cut -d- -f1)
# 当tensorflow版本为2.16.1时,不添加环境变量找不到cuda,所以需要这样执行验证。在正常交互式启动容器时,会默认激活/etc/bash.bashrc,可以正常找到cuda
if [[ "$tensorflow_version" == "2.16.1" ]]; then
python_version=$(echo $1 | awk -F'[-:]' '{for(i=3;i<=NF;i++) if($i ~ /^py[0-9]+\.[0-9]+$/) {gsub(/^py/,"",$i); print $i; exit}}')
docker run --rm --platform=linux/amd64 --gpus all \
-e CUDNN_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn" \
-e LD_LIBRARY_PATH="/opt/conda/lib/python$python_version/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64" \
-v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py
else
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/tensorflow/mnist $1 python train.py; fi; fi
if [[ "$1" == *"paddle"* ]]; then
TARGET_DIR=gpu-base-image-test/paddletest
docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace $1 python image.py; fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment