"scripts/vscode:/vscode.git/clone" did not exist on "c8ce177895ee1cf2c36d2ef4ca29356b8b545c4d"
Unverified Commit 1d2d054b authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Dockerfile - upgrade nccl version and install ucx to fix bug in cuda 12.4 docker file (#646)

**Description**
Fix bugs in the CUDA 12.4 Dockerfile.


**Major Revision**
- upgrade nccl due to OOM bug in nccl v2.20 graph mode
- install UCX 1.16 for multi-thread support for MPI in ib-traffic
parent f42835aa
...@@ -8,7 +8,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3 ...@@ -8,7 +8,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
# - CUDA: 12.4.0 # - CUDA: 12.4.0
# - cuDNN: 9.0.0.306 # - cuDNN: 9.0.0.306
# - cuBLAS: 12.4.2.65 # - cuBLAS: 12.4.2.65
# - NCCL: v2.20 # - NCCL: v2.23.4-1
# - TransformerEngine 1.4 # - TransformerEngine 1.4
# Mellanox: # Mellanox:
# - OFED: 23.07-0.5.1.2 # - OFED: 23.07-0.5.1.2
...@@ -115,6 +115,23 @@ RUN cd /tmp && \ ...@@ -115,6 +115,23 @@ RUN cd /tmp && \
mv amd-blis /opt/AMD && \ mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz rm -rf aocl-blis-linux-aocc-4.0.tar.gz
# Install NCCL 2.23.4
# Built from source at the v2.23.4-1 tag to replace the NCCL v2.20 listed for the
# base image (upgraded because of an OOM bug in NCCL v2.20 graph mode).
# A shallow clone is enough: only the tagged tree is compiled, so the full
# history is never needed. The checkout is removed in the same layer.
RUN cd /tmp && \
    git clone --depth 1 -b v2.23.4-1 https://github.com/NVIDIA/nccl.git && \
    cd nccl && \
    make -j ${NUM_MAKE_JOBS} src.build && \
    make install && \
    cd /tmp && \
    rm -rf /tmp/nccl
# Install UCX v1.16.0 with multi-threading support
# Configured through UCX's contrib/configure-release-mt wrapper and installed
# under /usr/local (which is on LD_LIBRARY_PATH via /usr/local/lib).
# The downloaded tarball and the build tree are deleted in this same RUN so
# they are not persisted in the image layer.
RUN cd /tmp && \
    wget https://github.com/openucx/ucx/releases/download/v1.16.0/ucx-1.16.0.tar.gz && \
    tar xzf ucx-1.16.0.tar.gz && \
    cd ucx-1.16.0 && \
    ./contrib/configure-release-mt --prefix=/usr/local && \
    make -j ${NUM_MAKE_JOBS} && \
    make install && \
    cd /tmp && \
    rm -rf /tmp/ucx-1.16.0 /tmp/ucx-1.16.0.tar.gz
ENV PATH="${PATH}" \ ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \ SB_HOME=/opt/superbench \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment