Unverified Commit 6ef3a011 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Add MSCCL Support for Nvidia GPU (#584)

**Description**
Add MSCCL support for Nvidia GPU
parent dd5a6329
...@@ -54,7 +54,7 @@ jobs: ...@@ -54,7 +54,7 @@ jobs:
- name: Checkout - name: Checkout
uses: actions/checkout@v2 uses: actions/checkout@v2
with: with:
submodules: true submodules: recursive
- name: Free disk space - name: Free disk space
run: | run: |
mkdir /tmp/emptydir mkdir /tmp/emptydir
......
...@@ -21,3 +21,6 @@ ...@@ -21,3 +21,6 @@
[submodule "third_party/gpu-burn"] [submodule "third_party/gpu-burn"]
path = third_party/gpu-burn path = third_party/gpu-burn
url = https://github.com/wilicc/gpu-burn.git url = https://github.com/wilicc/gpu-burn.git
[submodule "third_party/msccl"]
path = third_party/msccl
url = https://github.com/Azure/msccl
...@@ -35,6 +35,7 @@ RUN apt-get update && \ ...@@ -35,6 +35,7 @@ RUN apt-get update && \
libavutil-dev \ libavutil-dev \
libboost-program-options-dev \ libboost-program-options-dev \
libcap2 \ libcap2 \
libcurl4-openssl-dev \
libnuma-dev \ libnuma-dev \
libpci-dev \ libpci-dev \
libswresample-dev \ libswresample-dev \
...@@ -43,6 +44,7 @@ RUN apt-get update && \ ...@@ -43,6 +44,7 @@ RUN apt-get update && \
lshw \ lshw \
python3-mpi4py \ python3-mpi4py \
net-tools \ net-tools \
nlohmann-json3-dev \
openssh-client \ openssh-client \
openssh-server \ openssh-server \
pciutils \ pciutils \
...@@ -129,7 +131,7 @@ ADD dockerfile/etc /opt/microsoft/ ...@@ -129,7 +131,7 @@ ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME} WORKDIR ${SB_HOME}
ADD third_party third_party ADD third_party third_party
RUN make -C third_party cuda RUN make -C third_party cuda_with_msccl
ADD . . ADD . .
RUN python3 -m pip install --upgrade setuptools==65.7 && \ RUN python3 -m pip install --upgrade setuptools==65.7 && \
......
...@@ -11,10 +11,11 @@ HPCX_HOME ?= /opt/hpcx ...@@ -11,10 +11,11 @@ HPCX_HOME ?= /opt/hpcx
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
# Build all targets. # Build all targets.
all: cuda rocm all: cuda rocm
# Build every CUDA target and, in addition, the MSCCL components (cuda_msccl).
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
cpu: common cpu_perftest cpu: common cpu_perftest
...@@ -188,3 +189,26 @@ megatron_deepspeed: ...@@ -188,3 +189,26 @@ megatron_deepspeed:
cd Megatron && \ cd Megatron && \
python -m pip install -r requirements.txt && \ python -m pip install -r requirements.txt && \
python -m pip install DeepSpeed python -m pip install DeepSpeed
# Build MSCCL for CUDA.
#
# Three components are built from the msccl/ tree, in order, and copied under
# $(SB_MICRO_PATH):
#   1. msccl-executor-nccl  -> $(SB_MICRO_PATH)/lib/msccl-executor-nccl
#   2. msccl-scheduler      -> $(SB_MICRO_PATH)/lib/msccl-scheduler
#   3. msccl-tests-nccl     -> $(SB_MICRO_PATH)/bin/msccl-tests-nccl
#
# Each step is guarded by a parse-time $(wildcard ...) check on the component's
# Makefile, so a component whose sources are not checked out is skipped
# instead of failing the build.
#
# Sub-builds are invoked through $(MAKE) -C so that make flags such as -n are
# handed down to the child make; each recipe line runs in its own shell, so no
# "cd back" is needed afterwards.
cuda_msccl: sb_micro_path
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
	$(MAKE) -C ./msccl/executor/msccl-executor-nccl -j4 src.build
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
		cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
endif
ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
	# The scheduler is built against the executor installed above (BIN_HOME)
	# and its sources (SRC_HOME, relative to the scheduler directory).
	CXX=nvcc \
		BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl \
		SRC_HOME=../../../msccl/executor/msccl-executor-nccl \
		$(MAKE) -C ./msccl/scheduler/msccl-scheduler -j4
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
		cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
endif
ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
	# The tests use the installed executor directory as NCCL_HOME.
	$(MAKE) -C ./msccl/tests/msccl-tests-nccl \
		MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl -j4
	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
		cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
endif
Subproject commit 7d4beb8c0ba5b6c534c524023e57fe0467dc591c
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment