Makefile 17.1 KB
Newer Older
1
2
3
4
# Copyright (c) Microsoft Corporation - All rights reserved
# Licensed under the MIT License


5
6
7
8
# Install prefix and toolkit locations. Each is assigned with `?=`, so a value
# from the environment or the make command line takes precedence.
SB_MICRO_PATH ?= /usr/local
MPI_HOME ?= /usr/local/mpi
HIP_HOME ?= /opt/rocm/hip
RCCL_HOME ?= /opt/rocm/rccl
HPCX_HOME ?= /opt/hpcx
ROCM_PATH ?= /opt/rocm

# CUDA_VER (major.minor parsed from `nvcc --version`) is tested by several
# parse-time ifeq conditionals in this file. Resolve it once with a simple
# assignment instead of re-running the nvcc pipeline on every expansion; the
# origin check preserves the "only if not already set" behaviour of `?=`.
ifeq ($(origin CUDA_VER),undefined)
CUDA_VER := $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
endif
# The ROCm probes below stay lazily evaluated: in the rules visible here they
# are only referenced from recipes, so dpkg/hipconfig run only when needed.
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
HIPBLASLT_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
# Falls back to "0.0.0" when the hipconfig pipeline fails or finds no x.y.z version.
ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")

# Parallel job count handed to the sub-builds: all processors but two.
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)

19
# The targets below name actions, not files they produce, so they are declared
# phony. This includes cpu, cpu_perftest and sb_micro_path, which could
# otherwise be treated as up to date if a same-named file existed.
.PHONY: all cuda_with_msccl cuda rocm dtk cpu common sb_micro_path \
    cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl \
    rocm_perftest cpu_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest \
    gpcnet cuda_gpuburn cpu_stream cpu_hpl \
    directx_amf_encoding_latency directx_amd \
    rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm \
    nvbandwidth rocm_megatron_lm rocm_hpcg rocm_hpl rocm_hpl_mxp
20

21
# Build targets.
# Aggregate targets: each one only lists prerequisites and has no recipe.
all: cuda rocm
# msccl: api change in cudaStreamUpdateCaptureDependencies
# (cuda_msccl is not a prerequisite of `cuda`; it is only reached through this target.)
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm rocm_hpcg rocm_hpl rocm_hpl_mxp
cpu: common cpu_perftest
# Benchmarks shared by every platform group above.
common: fio cpu_stream
30
31
32

# non aarch64 specific targets
# When `uname -m` reports anything other than aarch64, cpu_hpl is added to the
# prerequisites of `common` and the directx_amd alias is defined.
ifneq ($(shell uname -m), aarch64)
common: cpu_hpl
directx_amd: directx_amf_encoding_latency
endif
36

37
38
39
40
# Prepare the install tree: make sure $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib
# exist. `-p` creates missing parents and is silent when they already exist.
sb_micro_path:
	mkdir -p $(SB_MICRO_PATH)/bin $(SB_MICRO_PATH)/lib
41

42
# Build cutlass.
# CUDA >= 12.9 re-clones tag v4.1.0 and CUDA 12.8 re-clones tag v3.9.2 (for
# Blackwell support); older CUDA versions build whatever is already present in
# ./cutlass. ARCHS selects the SM architectures passed to CUTLASS_NVCC_ARCHS.
# NOTE: the wildcard guard is evaluated when the Makefile is parsed, so the
# cmake steps are only emitted if cutlass/CMakeLists.txt exists at that time.
cuda_cutlass:
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
	$(eval ARCHS := "100;103")
	if [ -d cutlass ]; then rm -rf cutlass; fi
	git clone --branch v4.1.0 --depth 1 https://github.com/NVIDIA/cutlass.git
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
	$(eval ARCHS := "90;100")
	if [ -d cutlass ]; then rm -rf cutlass; fi
	git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
	$(eval ARCHS := "70;75;80;86;89;90")
else
	$(eval ARCHS := "70;75;80;86")
endif

ifneq (,$(wildcard cutlass/CMakeLists.txt))
# The kernel filter is one double-quoted shell string; the continuation lines
# start at column 0 so that no whitespace ends up inside the list.
	cmake -DCMAKE_INSTALL_BINDIR=$(SB_MICRO_PATH)/bin \
		-DCMAKE_INSTALL_LIBDIR=$(SB_MICRO_PATH)/lib \
		-DCMAKE_BUILD_TYPE=Release \
		-DCUTLASS_NVCC_ARCHS=$(ARCHS) \
		-DCUTLASS_ENABLE_EXAMPLES=OFF \
		-DCUTLASS_ENABLE_TESTS=OFF \
		-S ./cutlass \
		-B ./cutlass/build \
		-DCUTLASS_LIBRARY_KERNELS="cutlass_simt_dgemm_128x128_8x2_*,\
cutlass_simt_sgemm_128x128_8x2_*,\
cutlass_simt_hgemm_256x128_8x2_*,\
cutlass_tensorop_h884gemm_256x128_32x2_*,\
cutlass_tensorop_d884gemm_128x128_16x3_*,\
cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*,\
cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*,\
cutlass_tensorop_h16816gemm_256x128_32x3_*,\
cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*,\
cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*"
	cmake --build ./cutlass/build -j $(NUM_MAKE_JOBS) --target install
	rm -rf ./cutlass/build
endif
81

82
83
84
85
# Build cuda-samples/Samples/bandwidthTest.
# cuda-samples is released together with CUDA, they have the exact same version. Like v10.0, v11.1 and so on.
# The version we use is the released tag of cuda-samples which is consistent with the cuda version in the environment or docker.
# The Makefile of bandwidthTest does not have 'install' target, so need to copy bin to $(SB_MICRO_PATH)/bin/ and create $(SB_MICRO_PATH)/bin/ if not existing.
# The bandwidthTest sample was out-of-date and has been removed as of the CUDA Samples 12.9 release (see the change log). For up-to-date bandwidth measurements, refer instead to the NVBandwidth utility.
# For CUDA >= 12.9 nothing is built, so cuda-samples is not cloned either.
cuda_bandwidthTest: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
	@echo "Skip cuda-samples build for CUDA>=12.9"
else
	if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
	git clone --depth 1 -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
# CUDA 12.8: the sample is built with cmake in a build/ subdirectory.
	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
	cd ./$(TEST_PATH) && mkdir build && cd build && cmake .. && make
	cp -v ./$(TEST_PATH)/build/bandwidthTest $(SB_MICRO_PATH)/bin/
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
# CUDA 11.8 up to (not including) 12.8: sample Makefile, SM list includes 90.
	$(eval TEST_PATH := "./cuda-samples/Samples/1_Utilities/bandwidthTest")
	$(eval ARCHS := "70 75 80 86 90")
	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
else
# Older CUDA: the sample lives directly under Samples/.
	$(eval TEST_PATH := "./cuda-samples/Samples/bandwidthTest")
	$(eval ARCHS := "70 75 80 86")
	cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
	cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
endif
endif
107

108
109
# Build nccl-tests.
# The version we use is the tag v2.13.13
# Built with MPI support against $(MPI_HOME); everything under nccl-tests/build
# is copied into $(SB_MICRO_PATH)/bin. Skipped when nccl-tests/Makefile is
# absent at parse time.
cuda_nccl_tests: sb_micro_path
ifneq (,$(wildcard nccl-tests/Makefile))
	cd ./nccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
	cp -v -r ./nccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif
115
116

# Build perftest.
# The commit we use is ea1c778782df3ec09b5f8101017fc0140b51a63d.
# CUDA flavour: configure is pointed at /usr/local/cuda/include/cuda.h and the
# result is installed under $(SB_MICRO_PATH). Skipped when perftest/autogen.sh
# is absent at parse time.
cuda_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
	cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install
endif
122
123
# Build perftest with ROCm support enabled (--enable-rocm, --with-rocm=$(ROCM_PATH))
# and install it under $(SB_MICRO_PATH). Skipped when perftest/autogen.sh is
# absent at parse time.
rocm_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
	cd perftest && ./autogen.sh && ./configure --enable-rocm --with-rocm=$(ROCM_PATH) --prefix=$(SB_MICRO_PATH) && make -j && make install
endif
126
127
128
129
# Build perftest with no GPU-specific configure options and install it under
# $(SB_MICRO_PATH). Skipped when perftest/autogen.sh is absent at parse time.
cpu_perftest:
ifneq ($(wildcard perftest/autogen.sh),)
	cd perftest && \
	./autogen.sh && \
	./configure --prefix=$(SB_MICRO_PATH) && \
	make -j && \
	make install
endif
130

one's avatar
one committed
131
# Build FIO from commit ed675d347 (fio-3.41 tag).
# Configured with --disable-native and installed under $(SB_MICRO_PATH).
# Skipped when fio/Makefile is absent at parse time.
fio:
ifneq (,$(wildcard fio/Makefile))
	cd ./fio && ./configure --prefix=$(SB_MICRO_PATH) --disable-native && make -j && make install
endif
136

one's avatar
one committed
137
# Build rccl-tests from commit 66e513c of default branch.
# The GPU target list is read from $(ROCM_PATH)/bin/target.lst (joined with commas).
# Skipped when rccl-tests/install.sh is absent at parse time.
rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/install.sh))
# The hipify-perl symlink is best effort, so only that step is wrapped with
# `|| true`; a failing `cd` still aborts the recipe instead of being masked.
	cd ./rccl-tests && \
	{ ln -sf $$(which hipify-perl) $(ROCM_PATH)/bin/hipify-perl || true; } && \
	./install.sh --mpi --mpi_home $(MPI_HOME) \
	--rocm_home $(ROCM_PATH) \
	--rccl_home $(RCCL_HOME) \
	--hip_compiler hipcc \
	--gpu_targets $$(paste -sd ',' $(ROCM_PATH)/bin/target.lst)
	cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif
149
150
151
152
153
154

# Build rocblas-bench.
# RocBLAS is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# Since it takes several hours to build, avoid building again if rocblas-bench exists
# (either in $(SB_MICRO_PATH)/bin or anywhere on PATH).
# The steps are separated with `;`, so a failing step does not stop the later ones.
# NOTE(review): the final cp reads from $(SB_MICRO_PATH)/third_party/rocBLAS, which
# presumably is where this Makefile's directory lives — confirm.
rocm_rocblas: sb_micro_path
	@if [ ! -e $(SB_MICRO_PATH)/bin/rocblas-bench ] && [ -z `which rocblas-bench` ]; then \
		if [ -d rocBLAS ]; then rm -rf rocBLAS; fi; \
		git clone -b ${ROCBLAS_BRANCH} https://github.com/ROCmSoftwarePlatform/rocBLAS.git ./rocBLAS; \
		sed -i 's|#include "gemm.hpp"|#include "Tensile/gemm.hpp"|' rocBLAS/clients/benchmarks/../../library/src/blas3/rocblas_trtri.hpp; \
		cd ./rocBLAS && ./install.sh --dependencies --clients-only; \
		cp -v $(SB_MICRO_PATH)/third_party/rocBLAS/build/release/clients/staging/rocblas-bench $(SB_MICRO_PATH)/bin/; \
	fi
162

163
164
165
166
167
168
169
# Build hipblaslt-bench.
# hipBLASLt is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# Since it takes several hours to build, avoid building again if hipblaslt-bench exists
# (either in $(SB_MICRO_PATH)/bin or anywhere on PATH).
# The steps are separated with `;`, so a failing step does not stop the later ones.
rocm_hipblaslt: sb_micro_path
	@if [ ! -e $(SB_MICRO_PATH)/bin/hipblaslt-bench ] && [ -z `which hipblaslt-bench` ]; then \
		if [ -d hipBLASLt ]; then rm -rf hipBLASLt; fi; \
		git clone -b ${HIPBLASLT_BRANCH} https://github.com/ROCmSoftwarePlatform/hipBLASLt.git ./hipBLASLt; \
		cd ./hipBLASLt && ./install.sh -dc; \
		cp -v $(SB_MICRO_PATH)/third_party/hipBLASLt/build/release/clients/staging/hipblaslt-bench $(SB_MICRO_PATH)/bin/;  \
	fi

175
176
177
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.
# The clone is skipped when ./HIP already exists, so the target can be re-run
# (a plain `git clone` into an existing non-empty directory fails).
rocm_bandwidthTest: sb_micro_path
	if [ ! -d HIP ]; then git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git; fi
	cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
	cp -v ./HIP/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
182

183
184
185
186
187
188
189
190
191
192
193
194
# Build the HIP flavour of BabelStream (submodule tag v5.0) and stage the
# resulting hip-stream binary in $(SB_MICRO_PATH)/bin.
# Configure and build are separate recipe lines; make stops at the first failure.
# Skipped when BabelStream/CMakeLists.txt is absent at parse time.
rocm_babelstream_hip: sb_micro_path
ifneq ($(wildcard BabelStream/CMakeLists.txt),)
	cd ./BabelStream && cmake -S . -B build -DMODEL=hip -DCMAKE_CXX_COMPILER=hipcc -DCXX_EXTRA_FLAGS="--gpu-max-threads-per-block=1024"
	cd ./BabelStream && cmake --build build -j $(NUM_MAKE_JOBS)
	cp -v ./BabelStream/build/hip-stream $(SB_MICRO_PATH)/bin/
endif

one's avatar
one committed
195
196
197
198
199
200
# Build rocHPCG and stage helper scripts for SuperBench DTK image.
# install.sh is run with GPU-aware MPI enabled; the rochpcg binary and the
# run_rochpcg.sh wrapper (installed as run_rochpcg) are copied to
# $(SB_MICRO_PATH)/bin and marked executable.
# Skipped when rocHPCG/install.sh is absent at parse time.
rocm_hpcg: sb_micro_path
ifneq (,$(wildcard rocHPCG/install.sh))
	cd ./rocHPCG && \
	./install.sh --with-rocm=$(ROCM_PATH) --with-mpi=$(MPI_HOME) --gpu-aware-mpi=on
	cp -v ./rocHPCG/build/release/rochpcg-install/bin/rochpcg $(SB_MICRO_PATH)/bin/
	cp -v ./run_rochpcg.sh $(SB_MICRO_PATH)/bin/run_rochpcg
	chmod +x $(SB_MICRO_PATH)/bin/rochpcg $(SB_MICRO_PATH)/bin/run_rochpcg
endif

205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# rocHPL for the SuperBench DTK image: apply rochpl_dtk26.patch, run install.sh
# for the architectures listed in $(ROCM_PATH)/bin/target.lst, then stage
# rochpl, HPL.dat and the run_rochpl wrapper in $(SB_MICRO_PATH)/bin.
# Skipped when rocHPL/install.sh is absent at parse time.
rocm_hpl: sb_micro_path
ifneq ($(wildcard rocHPL/install.sh),)
	cd ./rocHPL && git apply ../rochpl_dtk26.patch && \
	./install.sh \
		--with-rocm=$(ROCM_PATH) \
		--with-rocblas=$(ROCM_PATH)/rocblas \
		--with-mpi=$(MPI_HOME) \
		--arch=$$(paste -sd ',' $(ROCM_PATH)/bin/target.lst)
	cp -v ./rocHPL/build/bin/rochpl ./rocHPL/build/HPL.dat $(SB_MICRO_PATH)/bin/
	cp -v ./run_rochpl.sh $(SB_MICRO_PATH)/bin/run_rochpl
	chmod +x $(SB_MICRO_PATH)/bin/rochpl $(SB_MICRO_PATH)/bin/run_rochpl
endif

# rocHPL-MxP for the SuperBench DTK image: apply rochplmxp_dtk26.patch, run
# install.sh for the architectures listed in $(ROCM_PATH)/bin/target.lst, then
# stage rochplmxp, HPL-MxP.dat and the run_rochplmxp wrapper in $(SB_MICRO_PATH)/bin.
# Skipped when rocHPL-MxP/install.sh is absent at parse time.
rocm_hpl_mxp: sb_micro_path
ifneq ($(wildcard rocHPL-MxP/install.sh),)
	cd ./rocHPL-MxP && git apply ../rochplmxp_dtk26.patch && \
	./install.sh \
		--with-rocm=$(ROCM_PATH) \
		--with-rocblas=$(ROCM_PATH)/rocblas \
		--with-rocsolver=$(ROCM_PATH)/rocsolver \
		--with-mpi=$(MPI_HOME) \
		--arch=$$(paste -sd ',' $(ROCM_PATH)/bin/target.lst)
	cp -v ./rocHPL-MxP/build/bin/rochplmxp ./rocHPL-MxP/build/HPL-MxP.dat $(SB_MICRO_PATH)/bin/
	cp -v ./run_rochplmxp.sh $(SB_MICRO_PATH)/bin/run_rochplmxp
	chmod +x $(SB_MICRO_PATH)/bin/rochplmxp $(SB_MICRO_PATH)/bin/run_rochplmxp
endif

236
237
238
239
240
# GPCNET (commit c56fd9): built with mpicc inside a bash session that has the
# HPC-X environment loaded from $(HPCX_HOME); both resulting binaries are then
# staged in $(SB_MICRO_PATH)/bin.
gpcnet: sb_micro_path
	bash -c "source $(HPCX_HOME)/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
	cp -v ./GPCNET/network_test ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/
241

rafsalas19's avatar
rafsalas19 committed
242
# Build GPU burn from main branch (only branch that exists)
# Both gpu_burn and compare.ptx are copied to $(SB_MICRO_PATH)/bin.
# Skipped when gpu-burn/Makefile is absent at parse time.
cuda_gpuburn: sb_micro_path
ifneq (,$(wildcard gpu-burn/Makefile))
	cd ./gpu-burn && make
	cp -v ./gpu-burn/gpu_burn $(SB_MICRO_PATH)/bin/
	cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
endif
rafsalas19's avatar
rafsalas19 committed
249

250
# Build HPL from main branch
# Downloads hpl-2.3 from netlib, drops the zen3/zen4 Make.* configurations into
# the unpacked tree and runs `make all` in hpl-tests. The two xhpl binaries are
# installed as xhpl_z3 / xhpl_z4 together with the helper scripts and the
# HPL.dat template. Skipped when hpl-tests/Makefile is absent at parse time.
cpu_hpl: sb_micro_path
ifneq (,$(wildcard hpl-tests/Makefile))
	cd ./hpl-tests && \
	wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
	tar xzf hpl-2.3.tar.gz && \
	cp Make.Linux_zen3 hpl-2.3 && \
	cp Make.Linux_zen4 hpl-2.3 && \
	make all
	cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen3/xhpl $(SB_MICRO_PATH)/bin/xhpl_z3
	cp -v ./hpl-tests/hpl-2.3/bin/Linux_zen4/xhpl $(SB_MICRO_PATH)/bin/xhpl_z4
	cp -v ./hpl-tests/hpl_run.sh $(SB_MICRO_PATH)/bin/
	cp -v ./hpl-tests/bindmem.sh $(SB_MICRO_PATH)/bin/
	cp -v ./hpl-tests/template_hpl.dat $(SB_MICRO_PATH)/bin/
endif

266
# Build STREAM
# Downloads stream.c into stream-tests, runs `make all` there and copies every
# file matching stream-tests/stream* to $(SB_MICRO_PATH)/bin.
# Skipped when stream-tests/Makefile is absent at parse time.
cpu_stream: sb_micro_path
ifneq (,$(wildcard stream-tests/Makefile))
	cd ./stream-tests && \
	wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
	make all
	cp -v ./stream-tests/stream* $(SB_MICRO_PATH)/bin/
endif
274
275
276
277
278
279
280
281

# Build AMD Encoder Latency Test
# This recipe uses Windows command-interpreter syntax (`if not exist`,
# `start /wait`, `del`, %VAR% expansion). It clones AMF v1.4.29 if missing, then,
# when the VS2019 samples solution is present, downloads and installs the VS
# Build Tools into C:/temp/BuildTools and builds the EncoderLatency project into
# %SB_MICRO_PATH%\bin.
directx_amf_encoding_latency:
	@if not exist "AMF" (git clone -b v1.4.29 https://github.com/GPUOpen-LibrariesAndSDKs/AMF.git)
	@if exist "AMF\amf\public\samples\CPPSamples_vs2019.sln" ( \
		curl -L -o vs_buildtools.exe https://aka.ms/vs/16/release/vs_buildtools.exe && echo "Downloaded vs_buildtools.exe" && \
		start /wait vs_buildtools.exe --quiet --wait --norestart --nocache --installPath C:/temp/BuildTools --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.ATLMFC --includeRecommended  && echo "Installed VS Build Tools" && \
		del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \
		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
	)
284

285
# Install requirements for Megatron-LM
# Runs pip (without cache) on requirements.txt from inside the Megatron directory.
megatron_lm:
	cd Megatron && \
	python -m pip install --no-cache-dir -r requirements.txt
289

290
# Install requirements for Megatron-DeepSpeed
# Same requirements.txt install as megatron_lm, followed by the DeepSpeed package.
megatron_deepspeed:
	cd Megatron && \
	python -m pip install --no-cache-dir -r requirements.txt && \
	python -m pip install DeepSpeed
295

296
297
298
299
300
301
302
303
304
305
306
307
# Fetch the ROCm fork of Megatron-LM (branch rocm_dev) into Megatron/rocm, copy
# the deepseek_v2 pretraining script to the fork's top level, and install
# grouped_gemm pinned at commit 8a9b438.
# Both clones are skipped when their directory already exists, so the target
# can be re-run without `git clone` failing on an existing checkout.
rocm_megatron_lm:
	cd Megatron && mkdir -p rocm && cd rocm && \
	if [ ! -d "Megatron-LM" ]; then \
		git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \
	fi
	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
	if [ ! -d grouped_gemm ]; then \
		git clone https://github.com/caaatch22/grouped_gemm.git ; \
	fi && \
	cd grouped_gemm && \
	git checkout 8a9b438 && \
	git submodule update --init --recursive && \
	pip install .

308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# Install apex of ROCm due to dependency of Megatron
# The apex branch is chosen from the installed torch version (major.minor):
#   newer than 2.1 -> master
#   2.1            -> release/1.1.0
#   2.0 or 1.x     -> release/1.0.0
# TORCH_VERSION and its major/minor parts are only computed if not already set.
apex_rocm:
	$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
	$(eval TORCH_MAJOR_VERSION ?= $(word 1,$(subst ., ,$(TORCH_VERSION))))
	$(eval TORCH_MINOR_VERSION ?= $(word 2,$(subst ., ,$(TORCH_VERSION))))
	if [ ! -d "apex" ]; then \
		git clone https://github.com/ROCmSoftwarePlatform/apex.git ; \
	fi
# "Newer than 2.1" means major > 2, or major == 2 with minor > 1. Requiring
# both major > 2 and minor > 1 would miss 2.2+ as well as 3.0 and 3.1.
	cd apex && \
	if [ "$(TORCH_MAJOR_VERSION)" -gt 2 ] || { [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -gt 1 ]; }; then \
		git checkout master ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -eq 1 ]; then \
		git checkout release/1.1.0 ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 2 ] && [ "$(TORCH_MINOR_VERSION)" -eq 0 ]; then \
		git checkout release/1.0.0 ; \
	elif [ "$(TORCH_MAJOR_VERSION)" -eq 1 ]; then \
		git checkout release/1.0.0 ; \
	fi
	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex

328
329
# Build MSCCL for CUDA
# Three components are built and staged, each only if its Makefile exists when
# this file is parsed:
#   executor  -> $(SB_MICRO_PATH)/lib/msccl-executor-nccl
#   scheduler -> $(SB_MICRO_PATH)/lib/msccl-scheduler
#   tests     -> $(SB_MICRO_PATH)/bin/msccl-tests-nccl
# ARCHS lists the SM architectures for the detected CUDA version; NVCC_GENCODE
# turns that list into one quoted string of -gencode flags.
cuda_msccl: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
	$(eval ARCHS := 100 103)
	if [ -d msccl ]; then rm -rf msccl; fi; \
	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
# Get commit 87048bd from msccl to support updated nccl and sm_100
	$(eval ARCHS := 75 80 86 89 90 100)
	if [ -d msccl ]; then rm -rf msccl; fi; \
	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
	$(eval ARCHS := 70 75 80 86 89 90)
else
# No quote characters here: ARCHS is wrapped in double quotes by NVCC_GENCODE
# below, so a stray quote would leave that string unbalanced for the shell.
	$(eval ARCHS := 70 75 80 86)
endif
	$(eval NVCC_GENCODE := "$(foreach arch, $(ARCHS), $(NVCC_GENCODE) -gencode=arch=compute_$(arch),code=sm_$(arch))")
ifneq (,$(wildcard msccl/executor/msccl-executor-nccl/Makefile))
	cd ./msccl/executor/msccl-executor-nccl && \
	make -j ${NUM_MAKE_JOBS} src.build NVCC_GENCODE=$(NVCC_GENCODE)
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-executor-nccl && \
	cp -r -v ./msccl/executor/msccl-executor-nccl/build/* $(SB_MICRO_PATH)/lib/msccl-executor-nccl/
endif
ifneq (,$(wildcard msccl/scheduler/msccl-scheduler/Makefile))
# The scheduler is compiled with nvcc against the executor staged above.
	cd ./msccl/scheduler/msccl-scheduler && \
	CXX=nvcc BIN_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl SRC_HOME=../../../msccl/executor/msccl-executor-nccl make -j ${NUM_MAKE_JOBS}
	mkdir -p $(SB_MICRO_PATH)/lib/msccl-scheduler && \
	cp -r -v ./msccl/scheduler/msccl-scheduler/build/* $(SB_MICRO_PATH)/lib/msccl-scheduler/
endif
ifneq (,$(wildcard msccl/tests/msccl-tests-nccl/Makefile))
	cd ./msccl/tests/msccl-tests-nccl && \
	make MPI=1 MPI_HOME=$(MPI_HOME) NCCL_HOME=$(SB_MICRO_PATH)/lib/msccl-executor-nccl NVCC_GENCODE=$(NVCC_GENCODE) -j ${NUM_MAKE_JOBS}
	mkdir -p $(SB_MICRO_PATH)/bin/msccl-tests-nccl && \
	cp -r -v ./msccl/tests/msccl-tests-nccl/build/* $(SB_MICRO_PATH)/bin/msccl-tests-nccl/
endif
367
368
369

# Build nvbandwidth.
# Applies nvbandwidth.patch, copies nvbandwidth_testcases_patched.h in as
# testcases_patched.h, then configures and builds in-tree with cmake + make.
# The resulting binary is staged in $(SB_MICRO_PATH)/bin.
nvbandwidth: sb_micro_path
	cd ./nvbandwidth && git apply ../nvbandwidth.patch && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin