feat: tensorflow initial push

222452e0 · chenpangpang · 33cf7c0a · 222452e0 · 222452e0 · 222452e0
Commit 222452e0 authored Oct 11, 2024 by chenpangpang
6 changed files
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 1. 准备一台裸机器，安装[nvidia-docker2](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)、git
 2. 下载镜像验证中需要的代码和模型（或从陈宜航处拷贝），放在项目根目录下
   1. 下载代码：`git clone http://developer.hpccube.com/codes/chenpangpang/gpu-base-image-test.git`
-   2. 下载模型: `cd gpu-base-image-test && python hf_down.py`
+   2. 下载模型(pytorch): `cd gpu-base-image-test/pytorch && python hf_down.py`
 3. 确认要构建的镜像
   - 镜像制作进度：https://bvjoh3z2qoz.feishu.cn/base/BKl6birVbarmzJsnznkcEDFTnV9?table=tbl3bCdS7qfjPn6j&view=vewww0URg8
 ## 镜像构建
@@ -20,23 +20,39 @@
  - 参数2: 输出镜像名
  - 参数3: 基础镜像
 - 基于[nvidia官方镜像](https://hub.docker.com/r/nvidia/cuda)构建镜像
-    ```bash
+  - pytorch
-  cd build_space && \
+      ```bash
-  ./build_ubuntu.sh jupyterlab \
+    cd build_space && \
-                    juypterlab-pytorch:2.3.1-py3.8-cuda12.1-ubuntu22.04-devel \
+    ./build_ubuntu.sh jupyterlab \
-                    nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 \
+                      jupyterlab-pytorch:2.3.1-py3.8-cuda12.1-ubuntu22.04-devel \
-                    TORCH_VERSION="2.3.1" \
+                      nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 \
-                    TORCHVISION_VERSION="0.18.1" \
+                      TORCH_VERSION="2.3.1" \
-                    TORCHAUDIO_VERSION="2.3.1" \
+                      TORCHVISION_VERSION="0.18.1" \
-                    CONDA_URL="https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_22.11.1-1-Linux-x86_64.sh"
+                      TORCHAUDIO_VERSION="2.3.1" \
-  ```
+                      CONDA_URL="https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py38_22.11.1-1-Linux-x86_64.sh"
-  - 参数1: ide，不需要改动
+    ```
-  - 参数2: 输出镜像名
+    - 参数1: ide，不需要改动
-  - 参数3: 基础镜像
+    - 参数2: 输出镜像名
-  - TORCH_VERSION：torch版本
+    - 参数3: 基础镜像
-  - TORCHVISION_VERSION：torchvision版本
+    - TORCH_VERSION：torch版本
-  - TORCHAUDIO_VERSION：torchaudio版本
+    - TORCHVISION_VERSION：torchvision版本
-  - CONDA_URL：安装conda的url
+    - TORCHAUDIO_VERSION：torchaudio版本
+    - CONDA_URL：安装conda的url
+  - tensorflow
+      ```bash
+    cd build_space && \
+    ./build_ubuntu.sh jupyterlab \
+                      jupyterlab-tensorflow:2.17.0-py3.11-cuda12.3-ubuntu22.04-devel \
+                      nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 \
+                      TENSORFLOW_VERSION="2.17.0" \
+                      CONDA_URL="https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py311_24.7.1-0-Linux-x86_64.sh"
+    ```
+    - 参数1: ide，不需要改动
+    - 参数2: 输出镜像名
+    - 参数3: 基础镜像
+    - TENSORFLOW_VERSION：tensorflow版本
+    - CONDA_URL：安装conda的url
 ### 相关链接
 - pytorch镜像(**选择devel镜像**)：https://hub.docker.com/r/pytorch/pytorch/tags

--- a/build_space/Dockerfile.jupyterlab_ubuntu
+++ b/build_space/Dockerfile.jupyterlab_ubuntu
@@ -4,11 +4,17 @@ FROM $BASE_IMAGE
 ARG BASE_IMAGE
 ARG DEBIAN_FRONTEND=noninteractive
 LABEL module="jupyter"
+# ----- torch args -----
 # 是否基于torch镜像构建
 ARG BASE_IMAGE_IS_TORCH=0
-ARG TORCH_VERSION="2.0.1"
+ARG TORCH_VERSION
-ARG TORCHVISION_VERSION="0.15.2"
+ARG TORCHVISION_VERSION
-ARG TORCHAUDIO_VERSION="2.0.2"
+ARG TORCHAUDIO_VERSION
+# ----- tensorflow args -----
+ARG TENSORFLOW_VERSION
 ARG CONDA_URL="https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py310_24.7.1-0-Linux-x86_64.sh"
 ARG SOURCES="-i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn"
 ENV TZ=Asia/Shanghai
@@ -54,11 +60,21 @@ RUN pip3 install --upgrade pip ${SOURCES} || pip install --upgrade pip ${SOURCES
    && mv /etc/apt/sources.list.bak /etc/apt/sources.list \
    && mv /etc/apt/sources.list.d.bak /etc/apt/sources.list.d
-RUN if [ $BASE_IMAGE_IS_TORCH -eq 0 ];then \
+# Install torch/torchvision/torchaudio only when the base image does not already
+# ship torch (BASE_IMAGE_IS_TORCH=0) and a TORCH_VERSION build arg was supplied.
+# The two conditions are separate `[ ]` tests joined by the shell's `&&`:
+# `&&` is not a valid operator inside a single POSIX `[ ... ]`, where it would
+# split the command and make the condition always fail.
+RUN if [ "$BASE_IMAGE_IS_TORCH" -eq 0 ] && [ -n "$TORCH_VERSION" ]; then \
    pip3 install torch==$TORCH_VERSION torchvision==$TORCHVISION_VERSION torchaudio==$TORCHAUDIO_VERSION \
    --index-url https://download.pytorch.org/whl/cu$(echo "$BASE_IMAGE" | awk -F'[:-]' '{n=split($2,a,"."); print a[1] a[2]}') \
    && rm -r /root/.cache/pip; fi
+# transformers/accelerate/diffusers are installed only when TORCH_VERSION is set.
+# NOTE(review): a build with BASE_IMAGE_IS_TORCH=1 and no TORCH_VERSION skips
+# these packages - confirm that is intended.
+RUN if [ -n "$TORCH_VERSION" ];then \
+    pip install --no-cache-dir transformers accelerate diffusers; fi
+# Install TensorFlow (with its CUDA extras) plus tensorflow-text and
+# tf-models-official from the same major.minor series, only when the
+# TENSORFLOW_VERSION build arg is supplied.
+# The pip requirement specifiers are quoted because `[and-cuda]` and `.*` are
+# shell glob patterns and must reach pip literally.
+# NOTE(review): the apt package names are pinned to CUDA 12.3 / TensorRT 8 -
+# confirm they match the chosen base image.
+RUN if [ -n "$TENSORFLOW_VERSION" ]; then \
+    tf_version_minor=$(echo "$TENSORFLOW_VERSION" | cut -d'.' -f1-2) && \
+    pip install --no-cache-dir "tensorflow[and-cuda]==$TENSORFLOW_VERSION" \
+    "tensorflow-text==$tf_version_minor.*" "tf-models-official==$tf_version_minor.*" && \
+    apt-get update -y && \
+    apt-get install --no-install-recommends -y libnvinfer8 libnvjitlink-12-3 libnvjpeg-12-3 libnvinfer-plugin8 && \
+    rm -rf /var/lib/apt/lists/*; fi
 COPY ./python-requirements.txt /tmp/
 RUN pip install --no-cache-dir -r /tmp/python-requirements.txt

--- a/build_space/python-requirements.txt
+++ b/build_space/python-requirements.txt
@@ -2,7 +2,4 @@ setuptools
 ipywidgets
 wheel
 matplotlib
-transformers
 git-lfs
-accelerate
-diffusers
\ No newline at end of file
--- a/script/1_base_test.sh
+++ b/script/1_base_test.sh
 #!/bin/bash
-docker run --rm --platform=linux/amd64 --gpus all $1  python -c \
+# 检查是否提供了输入参数
+if [ -z "$1" ]; then
+  echo "please set input image"
+  exit 1
+fi
+# 检查第一个输入参数中是否包含"pytorch"字符串
+if [[ "$1" == *"pytorch"* ]]; then
+  docker run --rm --platform=linux/amd64 --gpus all $1  python -c \
      "import os; \
      os.system(\"cat /etc/issue\"); \
      import sys; \
@@ -14,4 +22,23 @@ docker run --rm --platform=linux/amd64 --gpus all $1  python -c \
      print(\"torchvision version: \", torchvision.__version__); \
      import torchaudio; \
      print(\"torchaudio version: \", torchaudio.__version__);
      "
\ No newline at end of file
+elif [[ "$1" == *"tensorflow"* ]]; then
+  # Smoke-test a TensorFlow image: print the OS release, Python version,
+  # TensorFlow version, whether TensorFlow can see a GPU, and the nvcc version.
+  # tf.config.list_physical_devices('GPU') replaces the deprecated
+  # tf.test.is_gpu_available(); bool() keeps the True/False output.
+  docker run --rm --platform=linux/amd64 --gpus all "$1" python -c \
+      "import os; \
+      os.system(\"cat /etc/issue\"); \
+      import sys; \
+      print(\"python version: \", sys.version); \
+      import tensorflow as tf; \
+      print(\"tensorflow version: \", tf.__version__); \
+      print(\"tensorflow cuda available: \", bool(tf.config.list_physical_devices('GPU'))); \
+      os.system('nvcc -V | tail -n 2')
+      "
+else
+  echo "ERROR: no supported test shell"
+  exit 1
+fi
--- a/script/2_text_generate_test.sh
+++ b/script/2_text_generate_test.sh
 #!/bin/bash
 TARGET_DIR=gpu-base-image-test
-docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/gpt2 $1 python infer.py
+# 检查是否提供了输入参数
\ No newline at end of file
+if [ -z "$1" ]; then
+  echo "please set input image"
+  exit 1
+fi
+# Choose the workload from the framework named in the image tag:
+#   *pytorch*    -> pytorch/gpt2 infer.py
+#   *tensorflow* -> tensorflow/bert infer.py
+# Any other image name is rejected instead of silently doing nothing.
+# The bind mount uses an absolute path ($PWD/...) because older Docker CLIs
+# do not accept a relative host path for -v.
+if [[ "$1" == *"pytorch"* ]]; then
+  docker run --rm --platform=linux/amd64 --gpus all -v "$PWD/$TARGET_DIR:/workspace" --workdir /workspace/pytorch/gpt2 "$1" python infer.py
+elif [[ "$1" == *"tensorflow"* ]]; then
+  docker run --rm --platform=linux/amd64 --gpus all -v "$PWD/$TARGET_DIR:/workspace" --workdir /workspace/tensorflow/bert "$1" python infer.py
+else
+  echo "ERROR: no supported test shell"
+  exit 1
+fi
\ No newline at end of file
--- a/script/3_image_generate_test.sh
+++ b/script/3_image_generate_test.sh
 #!/bin/bash
 TARGET_DIR=gpu-base-image-test
-docker run --rm --platform=linux/amd64 --gpus all -v ./$TARGET_DIR:/workspace --workdir /workspace/stable-diffusion-v1-4 $1 python infer.py
+# 检查是否提供了输入参数
\ No newline at end of file
+if [ -z "$1" ]; then
+  echo "please set input image"
+  exit 1
+fi
+# Choose the workload from the framework named in the image tag:
+#   *pytorch*    -> pytorch/stable-diffusion-v1-4 infer.py
+#   *tensorflow* -> tensorflow/mnist train.py
+# Any other image name is rejected instead of silently doing nothing.
+# The bind mount uses an absolute path ($PWD/...) because older Docker CLIs
+# do not accept a relative host path for -v.
+if [[ "$1" == *"pytorch"* ]]; then
+  docker run --rm --platform=linux/amd64 --gpus all -v "$PWD/$TARGET_DIR:/workspace" --workdir /workspace/pytorch/stable-diffusion-v1-4 "$1" python infer.py
+elif [[ "$1" == *"tensorflow"* ]]; then
+  docker run --rm --platform=linux/amd64 --gpus all -v "$PWD/$TARGET_DIR:/workspace" --workdir /workspace/tensorflow/mnist "$1" python train.py
+else
+  echo "ERROR: no supported test shell"
+  exit 1
+fi