Dockerfile - Add CUDA11.8 Docker image for Nvidia arch90 GPUs (#449)

Add Docker image for arch90 NVIDIA GPUs:

* add CUDA11.8 Dockerfile
* update archs in Makefile and benchmarks accordingly
* update image build pipeline

Dockerfile - Add CUDA11.8 Docker image for Nvidia arch90 GPUs (#449)
Add Docker image for arch90 NVIDIA GPUs:

* add CUDA11.8 Dockerfile
* update archs in Makefile and benchmarks accordingly
* update image build pipeline
a3c65b2a · Yifan Xiong · GitHub · 7838b6b1 · a3c65b2a · a3c65b2a
Unverified Commit a3c65b2a authored Dec 29, 2022 by Yifan Xiong Committed by GitHub Dec 29, 2022
4 changed files
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -24,6 +24,9 @@ jobs:
    strategy:
      matrix:
        include:
+        - name: cuda11.8
+          dockerfile: cuda11.8
+          tags: superbench/main:cuda11.8
        - name: cuda11.1.1
          dockerfile: cuda11.1.1
          tags: superbench/main:cuda11.1.1,superbench/superbench:latest

--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -22,6 +22,7 @@ RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    autoconf \
    automake \
+    bc \
    build-essential \
    curl \
    dmidecode \

--- a/dockerfile/cuda11.8.dockerfile
+++ b/dockerfile/cuda11.8.dockerfile
+FROM nvcr.io/nvidia/pytorch:22.12-py3
+
+# Version inventory for this image. Docker Client, OFED, HPC-X and mlc are
+# installed by the steps below; the remaining entries are presumably inherited
+# from the base image (TODO confirm against the base image release notes).
+# OS:
+#   - Ubuntu: 20.04
+#   - OpenMPI: 4.1.5a1
+#   - Docker Client: 20.10.8
+# NVIDIA:
+#   - CUDA: 11.8.0
+#   - cuDNN: 8.7.0.84
+#   - NCCL: v2.15.5-1
+# Mellanox:
+#   - OFED: 5.2-2.2.3.0
+#   - HPC-X: v2.8.3
+# Intel:
+#   - mlc: v3.9a
+
+LABEL maintainer="SuperBench"
+
+# Install build tools and system utilities in a single layer, then drop the apt
+# lists and /tmp content so they do not end up in the image.
+# autoremove is given -y as well: the build has no TTY, so an unanswered
+# confirmation prompt would abort the step.
+ENV DEBIAN_FRONTEND=noninteractive
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    autoconf \
+    automake \
+    bc \
+    build-essential \
+    curl \
+    dmidecode \
+    git \
+    iproute2 \
+    jq \
+    libaio-dev \
+    libboost-program-options-dev \
+    libcap2 \
+    libnuma-dev \
+    libpci-dev \
+    libtinfo5 \
+    libtool \
+    lshw \
+    net-tools \
+    openssh-client \
+    openssh-server \
+    pciutils \
+    sudo \
+    util-linux \
+    vim \
+    wget \
+    && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+# Build argument, empty by default. No instruction in this Dockerfile references
+# it directly; presumably the make invocations below pick it up from the build
+# environment (TODO confirm against the Makefiles).
+ARG NUM_MAKE_JOBS=
+
+# Install Docker: fetch the static release tarball for DOCKER_VERSION and unpack
+# its contents (top-level directory stripped) into /usr/local/bin.
+ENV DOCKER_VERSION=20.10.8
+RUN wget -q -O /tmp/docker.tgz https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz && \
+    tar -x -f /tmp/docker.tgz --strip-components=1 -C /usr/local/bin/ && \
+    rm /tmp/docker.tgz
+
+# Update system config:
+#   - create /root/.ssh/authorized_keys and the /var/run/sshd directory
+#   - sshd: permit root login, permit user environment, listen on port 22
+#   - append soft/hard nofile limits of 1048576 for all users and for root
+# The Port substitution is anchored to the start of the line so it only rewrites
+# the Port directive itself, not other options whose name contains "Port".
+# printf is used for the limits because it handles "\n" the same in every shell,
+# whereas echo's treatment of escapes differs between shells.
+RUN mkdir -p /root/.ssh && \
+    touch /root/.ssh/authorized_keys && \
+    mkdir -p /var/run/sshd && \
+    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
+    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
+    sed -i "s/^[# ]*Port .*/Port 22/" /etc/ssh/sshd_config && \
+    printf '* soft nofile 1048576\n* hard nofile 1048576\n' >> /etc/security/limits.conf && \
+    printf 'root soft nofile 1048576\nroot hard nofile 1048576\n' >> /etc/security/limits.conf
+
+# Install OFED: download the MLNX_OFED bundle for OFED_VERSION (Ubuntu 20.04,
+# x86_64), run its installer for user-space components only without a firmware
+# update, then delete the unpacked bundle and tarball from /tmp.
+# The bundle is fetched over HTTPS because its installer is executed as root.
+ENV OFED_VERSION=5.2-2.2.3.0
+RUN cd /tmp && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
+    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
+    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
+    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
+
+# Install HPC-X under /opt: remove any existing /opt/hpcx, download and unpack
+# the v2.8.3 build that matches OFED_VERSION, symlink /opt/hpcx to the unpacked
+# directory, and delete the downloaded archive.
+RUN cd /opt && \
+    rm -rf hpcx && \
+    wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
+    tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
+    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \
+    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz
+
+# Install Intel MLC: download the v3.9a tarball, extract only the Linux/mlc
+# member, copy it to /usr/local/bin, then remove the extracted directory and
+# the tarball.
+RUN cd /tmp && \
+    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    tar xzf mlc.tgz Linux/mlc && \
+    cp ./Linux/mlc /usr/local/bin/ && \
+    rm -rf ./Linux mlc.tgz
+
+# Image-wide environment: PATH is re-declared with its current value,
+# /usr/local/lib is prepended to LD_LIBRARY_PATH, SB_HOME and SB_MICRO_PATH both
+# point at /opt/superbench, and two Ansible settings are fixed.
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+# Write PATH, LD_LIBRARY_PATH and SB_MICRO_PATH to /etc/environment. The first
+# echo uses ">" and therefore replaces any existing content of that file.
+# Presumably this makes the values visible to SSH login sessions (TODO confirm).
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+
+# Add config files
+ADD dockerfile/etc /opt/microsoft/
+
+WORKDIR ${SB_HOME}
+
+# Add third_party on its own and run the `cuda` target of third_party/Makefile
+# before the rest of the source tree is added (presumably so this layer stays
+# cached when only other sources change -- TODO confirm).
+ADD third_party third_party
+RUN make -C third_party cuda
+
+# Install the package with its nvworker extra, then run the cppbuild and
+# postinstall make targets and remove the .git directory brought in by `ADD . .`.
+# The extras spec is quoted so the shell cannot expand the brackets as a glob
+# pattern against files in the working directory.
+ADD . .
+RUN python3 -m pip install --no-cache-dir '.[nvworker]' && \
+    make cppbuild && \
+    make postinstall && \
+    rm -rf .git
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -6,9 +6,14 @@ SB_MICRO_PATH ?= /usr/local
 MPI_HOME ?= /usr/local/mpi
 HIP_HOME ?= /opt/rocm/hip
 RCCL_HOME ?= /opt/rocm/rccl
-ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
 HPCX_HOME ?= /opt/hpcx

+# CUDA toolkit version as MAJOR.MINOR, parsed from `nvcc --version`; empty when nvcc produces no output.
+# ?= keeps it overridable; the := right after pins the value so nvcc runs once instead of on every $(CUDA_VER) reference.
+CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
+CUDA_VER := $(CUDA_VER)
+ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
+
 .PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn

 # Build all targets.
@@ -24,9 +26,14 @@ sb_micro_path:

 # Build cutlass.
 cuda_cutlass:
+ifeq ($(shell printf '%s\n' 11.8 "$(CUDA_VER)" | sort -V | head -n 1),11.8) # CUDA_VER is 11.8 or newer: version-aware compare, so 11.10 ranks above 11.8 and bc is not needed
+	$(eval ARCHS := "70;75;80;86;90")
+else # older toolkit, or CUDA_VER is empty
+	$(eval ARCHS := "70;75;80;86")
+endif
 ifneq (,$(wildcard cutlass/CMakeLists.txt))
 	cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin -DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib -DCMAKE_BUILD_TYPE=Release \
-		-DCUTLASS_NVCC_ARCHS='70;75;80;86' -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
+		-DCUTLASS_NVCC_ARCHS=$(ARCHS) -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
 	cmake --build ./cutlass/build -j $(shell nproc --ignore=2) --target install
 endif

@@ -35,10 +42,17 @@ endif
 # The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
 # The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
 cuda_bandwidthTest: sb_micro_path
+ifeq ($(shell printf '%s\n' 11.8 "$(CUDA_VER)" | sort -V | head -n 1),11.8) # CUDA_VER is 11.8 or newer: version-aware compare, so 11.10 ranks above 11.8 and bc is not needed
+	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
+	$(eval ARCHS := "70 75 80 86 90")
+else # older toolkit, or CUDA_VER is empty: previous sample path and arch list
+	$(eval TEST_PATH := "./cuda-samples/Samples/bandwidthTest")
+	$(eval ARCHS := "70 75 80 86")
+endif
 	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
-	git clone -b v$(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) https://github.com/NVIDIA/cuda-samples.git ./cuda-samples
-	cd ./cuda-samples/Samples/bandwidthTest && make clean && make TARGET_ARCH=x86_64 SMS="70 75 80 86"
-	cp -v ./cuda-samples/Samples/bandwidthTest/bandwidthTest $(SB_MICRO_PATH)/bin/
+	git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
+	cd ./$(TEST_PATH) && make clean && make TARGET_ARCH=x86_64 SMS=$(ARCHS)
+	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/

 # Build nccl-tests from commit 8274cb4 of default branch.
 cuda_nccl_tests: sb_micro_path