Unverified commit 433785fd, authored by Ziyue Yang and committed by GitHub
Browse files

Benchmarks: Add Feature - Add GDR-only nccl-tests for Nvidia machines (#299)

This commit adds GDR-only nccl-tests for Nvidia machines. It also bumps NCCL to v2.10.3-1 to achieve peak performance in this test.
parent 682b2c12
......@@ -103,6 +103,8 @@ jobs:
tags: ${{ steps.metadata.outputs.tags }}
cache-from: ${{ steps.metadata.outputs.cache_from }}
cache-to: ${{ steps.metadata.outputs.cache_to }}
build-args: |
NUM_MAKE_JOBS=8
labels: |
org.opencontainers.image.source=${{ github.event.repository.html_url }}
org.opencontainers.image.created=${{ github.event.repository.pushed_at }}
......
......@@ -7,7 +7,7 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
# NVIDIA:
# - CUDA: 11.1.1
# - cuDNN: 8.0.5
# - NCCL: bootstrap_tag
# - NCCL: v2.10.3-1
# Mellanox:
# - OFED: 5.2-2.2.3.0
# - HPC-X: v2.8.3
......@@ -46,6 +46,8 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64
ARG NUM_MAKE_JOBS=
# Install Docker
ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \
......@@ -85,16 +87,16 @@ RUN cd /tmp && \
git reset --hard 7cccbc1 && \
./autogen.sh && \
./configure --prefix=/usr/local --with-cuda=/usr/local/cuda && \
make -j && \
make -j ${NUM_MAKE_JOBS} && \
make install && \
cd /tmp && \
rm -rf nccl-rdma-sharp-plugins
# Install NCCL patch
RUN cd /tmp && \
git clone -b bootstrap_tag https://github.com/NVIDIA/nccl.git && \
git clone -b v2.10.3-1 https://github.com/NVIDIA/nccl.git && \
cd nccl && \
make -j src.build && \
make -j ${NUM_MAKE_JOBS} src.build && \
make install && \
cd /tmp && \
rm -rf nccl
......@@ -117,7 +119,7 @@ ENV PATH="${PATH}" \
WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make -j -C third_party cuda
RUN make -j ${NUM_MAKE_JOBS} -C third_party cuda
ADD . .
RUN python3 -m pip install .[nvidia,torch,ort] && \
......
......@@ -43,7 +43,7 @@ superbench:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
nccl-bw:
nccl-bw:default:
enable: true
modes:
- name: local
......@@ -51,6 +51,21 @@ superbench:
parallel: no
parameters:
ngpus: 8
nccl-bw:gdr-only:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
env:
NCCL_IB_PCI_RELAXED_ORDERING: '1'
NCCL_NET_GDR_LEVEL: '5'
NCCL_P2P_DISABLE: '1'
NCCL_SHM_DISABLE: '1'
NCCL_MIN_NCHANNELS: '16'
NCCL_IB_DISABLE: '0'
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:
......
......@@ -39,7 +39,7 @@ superbench:
<<: *default_local_mode
gemm-flops:
<<: *default_local_mode
nccl-bw:
nccl-bw:default:
enable: true
modes:
- name: local
......@@ -47,6 +47,21 @@ superbench:
parallel: no
parameters:
ngpus: 8
nccl-bw:gdr-only:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
env:
NCCL_IB_PCI_RELAXED_ORDERING: '1'
NCCL_NET_GDR_LEVEL: '5'
NCCL_P2P_DISABLE: '1'
NCCL_SHM_DISABLE: '1'
NCCL_MIN_NCHANNELS: '16'
NCCL_IB_DISABLE: '0'
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:
......
......@@ -33,7 +33,7 @@ superbench:
model_action:
- train
benchmarks:
nccl-bw:
nccl-bw:default:
enable: true
modes:
- name: local
......@@ -41,6 +41,21 @@ superbench:
parallel: no
parameters:
ngpus: 8
nccl-bw:gdr-only:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
env:
NCCL_IB_PCI_RELAXED_ORDERING: '1'
NCCL_NET_GDR_LEVEL: '5'
NCCL_P2P_DISABLE: '1'
NCCL_SHM_DISABLE: '1'
NCCL_MIN_NCHANNELS: '16'
NCCL_IB_DISABLE: '0'
parameters:
ngpus: 8
ib-loopback:
enable: true
modes:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment