Unverified commit 32ed692e, authored by Yuting Jiang, committed by GitHub
Browse files

Benchmarks: Build Pipeline - fix nccl and nccl test version to 2.18.3 to...

Benchmarks: Build Pipeline - fix nccl and nccl test version to 2.18.3 to resolve hang issue in cuda12.2 docker (#599)

**Description**
Upgrade nccl-tests to resolve the all-gather hang issue in the CUDA 12.2 Docker image.
parent 885b5f51
......@@ -88,11 +88,11 @@ jobs:
CACHE_TO="type=inline,mode=max"
fi
echo ::set-output name=dockerfile::${DOCKERFILE}
echo ::set-output name=build_args::${BUILD_ARGS}
echo ::set-output name=tags::${TAGS}
echo ::set-output name=cache_from::${CACHE_FROM}
echo ::set-output name=cache_to::${CACHE_TO}
echo "dockerfile=${DOCKERFILE}" >> "$GITHUB_OUTPUT"
echo "build_args=${BUILD_ARGS}" >> "$GITHUB_OUTPUT"
echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"
echo "cache_from=${CACHE_FROM}" >> "$GITHUB_OUTPUT"
echo "cache_to=${CACHE_TO}" >> "$GITHUB_OUTPUT"
# Print the build_args output of the `metadata` step to the job log.
- name: Echo build args
  run: |
    echo ${{ steps.metadata.outputs.build_args }}
- name: Echo image tag
......@@ -107,6 +107,9 @@ jobs:
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
# Pull the image named by the `metadata` step's `tags` output.
# continue-on-error keeps the job going when the pull fails.
# NOTE(review): presumably this pre-populates the local layer cache for the
# build that follows — confirm against the build step.
- name: Pull cache image
  continue-on-error: true
  run: |
    sudo docker pull ${{ steps.metadata.outputs.tags }}
- name: Login to the GitHub Container Registry
uses: docker/login-action@v1
if: ${{ github.event_name == 'release' }}
......
......@@ -7,7 +7,7 @@ FROM nvcr.io/nvidia/pytorch:23.10-py3
# NVIDIA:
# - CUDA: 12.2.2
# - cuDNN: 8.9.5
# - NCCL: v2.19.3-1
# - NCCL: v2.18.3-1
# Mellanox:
# - OFED: 23.07-0.5.1.2
# - HPC-X: v2.16
......@@ -113,6 +113,13 @@ RUN cd /tmp && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz
# Build NCCL from the v2.18.3-1 tag, install it, then delete the checkout
# so the sources do not remain in this image layer.
RUN git clone -b v2.18.3-1 https://github.com/NVIDIA/nccl.git /tmp/nccl && \
    cd /tmp/nccl && \
    make -j src.build && \
    make install && \
    rm -rf /tmp/nccl
ENV PATH="${PATH}" \
LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
......
......@@ -62,7 +62,7 @@ endif
# Build nccl-tests (passing MPI=1 and MPI_HOME) when the nccl-tests checkout
# is present, then copy everything under nccl-tests/build into
# $(SB_MICRO_PATH)/bin/.
# The copy must be recursive: plain `cp` skips directories and exits non-zero,
# which would abort this recipe if nccl-tests/build contains a subdirectory.
cuda_nccl_tests: sb_micro_path
ifneq (,$(wildcard nccl-tests/Makefile))
	cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
	cp -v -r ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif
# Build perftest.
......
Subproject commit 8274cb47b6dc70ce4411e7f114b77173d3892414
Subproject commit 1292b25553bd0384f2faa2965f9d82b99797a348
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment