Unverified commit 1ad1c21c, authored by Yifan Xiong and committed by GitHub
Browse files

Dockerfile - Upgrade Docker image to CUDA 12.2 (#577)

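Upgrade the Docker image to CUDA 12.2 for H100:
* upgrade the base image to nvcr.io/nvidia/pytorch:23.10-py3
* fix the onnxruntime-gpu version requirement for Python 3.10 and later
* fix compilation errors in the cuBLAS function code by adding explicit type casts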
parent 2235e084
...@@ -24,9 +24,9 @@ jobs: ...@@ -24,9 +24,9 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- name: cuda12.1 - name: cuda12.2
dockerfile: cuda12.1 dockerfile: cuda12.2
tags: superbench/main:cuda12.1 tags: superbench/main:cuda12.2
- name: cuda11.1.1 - name: cuda11.1.1
dockerfile: cuda11.1.1 dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest tags: superbench/main:cuda11.1.1,superbench/superbench:latest
......
FROM nvcr.io/nvidia/pytorch:23.03-py3 FROM nvcr.io/nvidia/pytorch:23.10-py3
# OS: # OS:
# - Ubuntu: 20.04 # - Ubuntu: 22.04
# - OpenMPI: 4.1.5a1 # - OpenMPI: 4.1.5rc2
# - Docker Client: 20.10.8 # - Docker Client: 20.10.8
# NVIDIA: # NVIDIA:
# - CUDA: 12.1.0 # - CUDA: 12.2.2
# - cuDNN: 8.8.1.3 # - cuDNN: 8.9.5
# - NCCL: v2.17.1-1 # - NCCL: v2.19.3-1
# Mellanox: # Mellanox:
# - OFED: 5.2-2.2.3.0 # TODO # - OFED: 23.07-0.5.1.2
# - HPC-X: v2.14 # - HPC-X: v2.16
# Intel: # Intel:
# - mlc: v3.10 # - mlc: v3.10
...@@ -74,20 +74,20 @@ RUN mkdir -p /root/.ssh && \ ...@@ -74,20 +74,20 @@ RUN mkdir -p /root/.ssh && \
echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf
# Install OFED # Install OFED
ENV OFED_VERSION=5.2-2.2.3.0 ENV OFED_VERSION=23.07-0.5.1.2
RUN cd /tmp && \ RUN cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X # Install HPC-X
ENV HPCX_VERSION=v2.14 ENV HPCX_VERSION=v2.16
RUN cd /opt && \ RUN cd /opt && \
rm -rf hpcx && \ rm -rf hpcx && \
wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \ wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \ tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \ mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64 hpcx && \
rm hpcx.tbz rm hpcx.tbz
# Install Intel MLC # Install Intel MLC
...@@ -131,7 +131,8 @@ ADD third_party third_party ...@@ -131,7 +131,8 @@ ADD third_party third_party
RUN make -C third_party cuda RUN make -C third_party cuda
ADD . . ADD . .
RUN python3 -m pip install --no-cache-dir .[nvworker] && \ RUN python3 -m pip install --upgrade setuptools==65.7 && \
python3 -m pip install --no-cache-dir .[nvworker] && \
make cppbuild && \ make cppbuild && \
make postinstall && \ make postinstall && \
rm -rf .git rm -rf .git
...@@ -213,7 +213,8 @@ def run(self): ...@@ -213,7 +213,8 @@ def run(self):
], ],
'ort': [ 'ort': [
'onnx>=1.10.2', 'onnx>=1.10.2',
'onnxruntime-gpu==1.10.0', 'onnxruntime-gpu==1.10.0; python_version<"3.10"',
'onnxruntime-gpu; python_version>="3.10"',
], ],
'nvidia': ['py3nvml>=0.2.6'], 'nvidia': ['py3nvml>=0.2.6'],
} }
......
...@@ -366,8 +366,8 @@ void CublasFunction::matrix_calculation_on_cpu_with_data(const T1 *Parameter_0_0 ...@@ -366,8 +366,8 @@ void CublasFunction::matrix_calculation_on_cpu_with_data(const T1 *Parameter_0_0
for (int j = 0; j < n; j++) { for (int j = 0; j < n; j++) {
(*Result_cpu)[i + j * m + b * m * n] = beta * (T2)(Result_3_0_host[i + j * m + b * m * n]); (*Result_cpu)[i + j * m + b * m * n] = beta * (T2)(Result_3_0_host[i + j * m + b * m * n]);
for (int p = 0; p < k; p++) { for (int p = 0; p < k; p++) {
(*Result_cpu)[i + j * m + b * m * n] += (*Result_cpu)[i + j * m + b * m * n] += (T2)(Parameter_0_0_host_op[p * m + i + b * m * k] *
Parameter_0_0_host_op[p * m + i + b * m * k] * Parameter_1_0_host_op[j * k + p + b * k * n]; Parameter_1_0_host_op[j * k + p + b * k * n]);
(*Result_cpu)[i + j * m + b * m * n] *= alpha; (*Result_cpu)[i + j * m + b * m * n] *= alpha;
} }
} }
...@@ -444,7 +444,7 @@ int CublasFunction::check_result(int batch_count, T1 *Result_3_0, T2 *Result_cpu ...@@ -444,7 +444,7 @@ int CublasFunction::check_result(int batch_count, T1 *Result_3_0, T2 *Result_cpu
// |<x, y>_cpu - <x,y>_gpu|/|<x, y>_cpu|/dot_length < eps // |<x, y>_cpu - <x,y>_gpu|/|<x, y>_cpu|/dot_length < eps
int error_count = 0; int error_count = 0;
for (int i = 0; i < static_cast<int>(m * n) * batch_count; i++) { for (int i = 0; i < static_cast<int>(m * n) * batch_count; i++) {
double abs_err = fabs(Result_cpu[i] - Result_3_0_host[i]); double abs_err = fabs(Result_cpu[i] - (T2)(Result_3_0_host[i]));
double dot_length = k; double dot_length = k;
double abs_val = fabs(Result_cpu[i]); double abs_val = fabs(Result_cpu[i]);
double rel_err = abs_err / abs_val / dot_length; double rel_err = abs_err / abs_val / dot_length;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment